This work is sponsored by, and copyright, Google.
Splitting the quarter/half special cases out into separate functions
avoids having to fill the temp buffer with zeros for the skipped slices,
and leads to slightly more straightforward code for these cases, instead
of riddling the common code with special-case branches or macro .ifs.
(For the 16x16 case, the special-case pass functions are written out
separately instead of being templated from the same macro.)
The code size increases from 15000 bytes to 19864 bytes.
Before:
vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5 188.7 211.7 235.1
vp9_inv_dct_dct_16x16_sub4_add_neon: 1336.5 1012.5 1225.9 860.7
vp9_inv_dct_dct_16x16_sub8_add_neon: 2023.2 1768.8 1868.1 1358.0
vp9_inv_dct_dct_16x16_sub12_add_neon: 2947.1 2228.9 2304.8 1795.7
vp9_inv_dct_dct_16x16_sub16_add_neon: 3247.9 2536.7 2547.0 2036.1
vp9_inv_dct_dct_32x32_sub1_add_neon: 751.5 456.7 863.5 553.9
vp9_inv_dct_dct_32x32_sub4_add_neon: 8019.6 5868.0 6632.6 5134.4
vp9_inv_dct_dct_32x32_sub8_add_neon: 8808.1 6966.8 7198.0 5690.6
vp9_inv_dct_dct_32x32_sub12_add_neon: 11291.5 10146.7 9628.8 7566.7
vp9_inv_dct_dct_32x32_sub16_add_neon: 12159.2 11004.2 10373.3 8237.7
vp9_inv_dct_dct_32x32_sub20_add_neon: 15230.9 13467.6 11841.1 9748.8
vp9_inv_dct_dct_32x32_sub24_add_neon: 16361.5 14854.5 12677.6 10505.0
vp9_inv_dct_dct_32x32_sub28_add_neon: 17497.8 15833.3 13493.0 11254.0
vp9_inv_dct_dct_32x32_sub32_add_neon: 18591.8 17348.5 14355.5 12001.7
After:
vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5 188.7 211.7 235.1
vp9_inv_dct_dct_16x16_sub4_add_neon: 1209.5 863.9 1034.7 764.7
vp9_inv_dct_dct_16x16_sub8_add_neon: 1915.8 1590.9 1739.0 1281.7
vp9_inv_dct_dct_16x16_sub12_add_neon: 2850.5 2204.3 2292.1 1779.8
vp9_inv_dct_dct_16x16_sub16_add_neon: 3240.1 2490.6 2555.8 2009.9
vp9_inv_dct_dct_32x32_sub1_add_neon: 751.5 458.9 863.5 553.9
vp9_inv_dct_dct_32x32_sub4_add_neon: 7566.3 5721.3 6043.8 4920.7
vp9_inv_dct_dct_32x32_sub8_add_neon: 8366.1 6786.5 6594.1 5476.2
vp9_inv_dct_dct_32x32_sub12_add_neon: 10980.0 9885.5 9237.7 7436.5
vp9_inv_dct_dct_32x32_sub16_add_neon: 11917.3 11156.8 9963.0 8113.0
vp9_inv_dct_dct_32x32_sub20_add_neon: 15201.3 13632.9 11844.2 9819.9
vp9_inv_dct_dct_32x32_sub24_add_neon: 16333.8 14541.2 12654.5 10580.7
vp9_inv_dct_dct_32x32_sub28_add_neon: 17459.1 16165.8 13450.0 11325.3
vp9_inv_dct_dct_32x32_sub32_add_neon: 18612.2 17386.7 14281.6 12065.8
---
This reverts parts of the previous commit (changing some register uses to
another register); if both are to be applied, they should be applied
squashed together. (And similarly for review, it's much easier to squash the
two and review the end result.) They are presented sequentially as two steps,
to show the effect on runtime and code size of each alternative.
---
libavcodec/arm/vp9itxfm_neon.S | 532 +++++++++++++++++++++++++++++------------
1 file changed, 374 insertions(+), 158 deletions(-)
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 99a5e1f..b6c23c8 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -745,6 +745,42 @@ function iadst16
bx lr
endfunc
+.macro load_add_store coef0, coef1, coef2, coef3
+ vrshr.s16 \coef0, \coef0, #6
+ vrshr.s16 \coef1, \coef1, #6
+
+ vld1.32 {d4[]}, [r0,:32], r1
+ vld1.32 {d4[1]}, [r3,:32], r1
+ vrshr.s16 \coef2, \coef2, #6
+ vrshr.s16 \coef3, \coef3, #6
+ vld1.32 {d5[]}, [r0,:32], r1
+ vld1.32 {d5[1]}, [r3,:32], r1
+ vaddw.u8 \coef0, \coef0, d4
+ vld1.32 {d6[]}, [r0,:32], r1
+ vld1.32 {d6[1]}, [r3,:32], r1
+ vaddw.u8 \coef1, \coef1, d5
+ vld1.32 {d7[]}, [r0,:32], r1
+ vld1.32 {d7[1]}, [r3,:32], r1
+
+ vqmovun.s16 d4, \coef0
+ vqmovun.s16 d5, \coef1
+ sub r0, r0, r1, lsl #2
+ sub r3, r3, r1, lsl #2
+ vaddw.u8 \coef2, \coef2, d6
+ vaddw.u8 \coef3, \coef3, d7
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r3,:32], r1
+ vqmovun.s16 d6, \coef2
+ vst1.32 {d5[0]}, [r0,:32], r1
+ vst1.32 {d5[1]}, [r3,:32], r1
+ vqmovun.s16 d7, \coef3
+
+ vst1.32 {d6[0]}, [r0,:32], r1
+ vst1.32 {d6[1]}, [r3,:32], r1
+ vst1.32 {d7[0]}, [r0,:32], r1
+ vst1.32 {d7[1]}, [r3,:32], r1
+.endm
+
.macro itxfm16_1d_funcs txfm
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@ transpose into a horizontal 16x4 slice and store.
@@ -763,40 +799,13 @@ function \txfm\()16_1d_4x16_pass1_neon
mov r12, #32
vmov.s16 q2, #0
-
-.ifc \txfm,idct
- cmp r3, #10
- ble 3f
- cmp r3, #38
- ble 4f
-.endif
-
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
.endr
bl \txfm\()16
-.ifc \txfm,idct
- b 5f
-
-3:
-.irp i, 16, 17, 18, 19
- vld1.16 {d\i}, [r2,:64]
- vst1.16 {d4}, [r2,:64], r12
-.endr
- bl idct16_quarter
- b 5f
-
-4:
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- vld1.16 {d\i}, [r2,:64]
- vst1.16 {d4}, [r2,:64], r12
-.endr
- bl idct16_half
-.endif
-5:
@ Do four 4x4 transposes. Originally, d16-d31 contain the
@ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
@ contain the transposed 4x4 blocks.
@@ -855,84 +864,26 @@ endfunc
@ r0 = dst
@ r1 = dst stride
@ r2 = src (temp buffer)
-@ r3 = eob
-@ r9 = slice offset
+@ r3 = slice offset
function \txfm\()16_1d_4x16_pass2_neon
push {lr}
mov r12, #32
-.ifc \txfm,idct
- cmp r3, #10
- ble 3f
- cmp r3, #38
- ble 4f
-.endif
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
vld1.16 {d\i}, [r2,:64], r12
.endr
- cmp r9, #0
+ cmp r3, #0
beq 1f
.irp i, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64], r12
.endr
1:
- bl \txfm\()16
-.ifc \txfm,idct
- b 5f
-3:
-.irp i, 16, 17, 18, 19
- vld1.16 {d\i}, [r2,:64], r12
-.endr
- bl idct16_quarter
- b 5f
-4:
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- vld1.16 {d\i}, [r2,:64], r12
-.endr
- bl idct16_half
-.endif
-
-5:
- add r8, r0, r1
+ add r3, r0, r1
lsl r1, r1, #1
-.macro load_add_store coef0, coef1, coef2, coef3
- vrshr.s16 \coef0, \coef0, #6
- vrshr.s16 \coef1, \coef1, #6
-
- vld1.32 {d4[]}, [r0,:32], r1
- vld1.32 {d4[1]}, [r8,:32], r1
- vrshr.s16 \coef2, \coef2, #6
- vrshr.s16 \coef3, \coef3, #6
- vld1.32 {d5[]}, [r0,:32], r1
- vld1.32 {d5[1]}, [r8,:32], r1
- vaddw.u8 \coef0, \coef0, d4
- vld1.32 {d6[]}, [r0,:32], r1
- vld1.32 {d6[1]}, [r8,:32], r1
- vaddw.u8 \coef1, \coef1, d5
- vld1.32 {d7[]}, [r0,:32], r1
- vld1.32 {d7[1]}, [r8,:32], r1
-
- vqmovun.s16 d4, \coef0
- vqmovun.s16 d5, \coef1
- sub r0, r0, r1, lsl #2
- sub r8, r8, r1, lsl #2
- vaddw.u8 \coef2, \coef2, d6
- vaddw.u8 \coef3, \coef3, d7
- vst1.32 {d4[0]}, [r0,:32], r1
- vst1.32 {d4[1]}, [r8,:32], r1
- vqmovun.s16 d6, \coef2
- vst1.32 {d5[0]}, [r0,:32], r1
- vst1.32 {d5[1]}, [r8,:32], r1
- vqmovun.s16 d7, \coef3
+ bl \txfm\()16
- vst1.32 {d6[0]}, [r0,:32], r1
- vst1.32 {d6[1]}, [r8,:32], r1
- vst1.32 {d7[0]}, [r0,:32], r1
- vst1.32 {d7[1]}, [r8,:32], r1
-.endm
load_add_store q8, q9, q10, q11
load_add_store q12, q13, q14, q15
-.purgem load_add_store
pop {pc}
endfunc
@@ -951,12 +902,15 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon,
export=1
.ifc \txfm1\()_\txfm2,idct_idct
cmp r3, #1
beq idct16x16_dc_add_neon
+ cmp r3, #10
+ ble idct16x16_quarter_add_neon
+ cmp r3, #38
+ ble idct16x16_half_add_neon
.endif
push {r4-r9,lr}
.ifnc \txfm1\()_\txfm2,idct_idct
vpush {q4-q7}
mov r9, #0
- mov r3, #256
.else
movrel r8, min_eob_idct_idct_16
.endif
@@ -994,7 +948,7 @@ A and r7, sp, #15
add r0, r4, #(\i)
mov r1, r5
add r2, sp, #(\i*2)
- mov r9, #\i
+ mov r3, #\i
bl \txfm2\()16_1d_4x16_pass2_neon
.endr
@@ -1012,6 +966,211 @@ itxfm_func16x16 idct, iadst
itxfm_func16x16 iadst, iadst
.ltorg
+function idct16_1d_4x16_pass1_quarter_neon
+ push {lr}
+ mov r12, #32
+ vmov.s16 q2, #0
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+
+ bl idct16_quarter
+
+ @ Do four 4x4 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+ @ contain the transposed 4x4 blocks.
+ transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17,
d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 4x4 blocks horizontally.
+ @ The first 4x4 block is kept in registers for the second pass,
+ @ store the rest in the temp buffer.
+ add r0, r0, #8
+.irp i, 20, 24, 28
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ add r0, r0, #8
+.irp i, 21, 25, 29
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ add r0, r0, #8
+.irp i, 22, 26, 30
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ add r0, r0, #8
+.irp i, 23, 27, 31
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ pop {pc}
+endfunc
+
+function idct16_1d_4x16_pass2_quarter_neon
+ push {lr}
+ cmp r3, #0
+ mov r12, #32
+ beq 1f
+ @ Only load the top 4 lines, and only do it for the later slices.
+ @ For the first slice, d16-d19 is kept in registers from the first pass.
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+1:
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ bl idct16_quarter
+
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+
+ pop {pc}
+endfunc
+
+function idct16_1d_4x16_pass1_half_neon
+ push {lr}
+ mov r12, #32
+ vmov.s16 q2, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+
+ bl idct16_half
+
+ @ Do four 4x4 transposes. Originally, d16-d31 contain the
+ @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+ @ contain the transposed 4x4 blocks.
+ transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17,
d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+ @ Store the transposed 4x4 blocks horizontally.
+ cmp r1, #4
+ beq 1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ pop {pc}
+1:
+ @ Special case: For the second input column (r1 == 4),
+ @ which would be stored as the second row in the temp buffer,
+ @ don't store the first 4x4 block, but keep it in registers
+ @ for the first slice of the second pass (where it is the
+ @ second 4x4 block).
+ add r0, r0, #8
+.irp i, 20, 24, 28
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ add r0, r0, #8
+.irp i, 21, 25, 29
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ add r0, r0, #8
+.irp i, 22, 26, 30
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ add r0, r0, #8
+.irp i, 23, 27, 31
+ vst1.16 {d\i}, [r0,:64]!
+.endr
+ vmov d20, d16
+ vmov d21, d17
+ vmov d22, d18
+ vmov d23, d19
+ pop {pc}
+endfunc
+
+function idct16_1d_4x16_pass2_half_neon
+ push {lr}
+ mov r12, #32
+ cmp r3, #0
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+ beq 1f
+.irp i, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64], r12
+.endr
+1:
+
+ add r3, r0, r1
+ lsl r1, r1, #1
+ bl idct16_half
+
+ load_add_store q8, q9, q10, q11
+ load_add_store q12, q13, q14, q15
+
+ pop {pc}
+endfunc
+.purgem load_add_store
+
+function idct16x16_quarter_add_neon
+ push {r4-r9,lr}
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #512
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+
+.irp i, 0
+ add r0, sp, #(\i*32)
+ mov r1, #\i
+ add r2, r6, #(\i*2)
+ bl idct16_1d_4x16_pass1_quarter_neon
+.endr
+.irp i, 0, 4, 8, 12
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ mov r3, #\i
+ bl idct16_1d_4x16_pass2_quarter_neon
+.endr
+
+ add sp, sp, r7
+ pop {r4-r9,pc}
+endfunc
+
+function idct16x16_half_add_neon
+ push {r4-r9,lr}
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #512
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+ movrel r12, idct_coeffs
+ vld1.16 {q0-q1}, [r12,:128]
+
+.irp i, 0, 4
+ add r0, sp, #(\i*32)
+ mov r1, #\i
+ add r2, r6, #(\i*2)
+ bl idct16_1d_4x16_pass1_half_neon
+.endr
+.irp i, 0, 4, 8, 12
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ mov r3, #\i
+ bl idct16_1d_4x16_pass2_half_neon
+.endr
+
+ add sp, sp, r7
+ pop {r4-r9,pc}
+endfunc
function idct32x32_dc_add_neon
movrel r12, idct_coeffs
@@ -1198,6 +1357,7 @@ function idct32_odd_quarter
idct32_end
endfunc
+.macro idct32_funcs suffix
@ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
@ We don't have register space to do a single pass IDCT of 4x32 though,
@ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
@@ -1208,7 +1368,7 @@ endfunc
@ r1 = min eob
@ r2 = src
@ r3 = eob
-function idct32_1d_4x32_pass1_neon
+function idct32_1d_4x32_pass1\suffix\()_neon
@ Check if this whole input slice is zero
cmp r3, r1
ble 1f
@@ -1221,37 +1381,28 @@ function idct32_1d_4x32_pass1_neon
mov r12, #128
vmov.s16 d4, #0
- cmp r3, #34
- ble 3f
- cmp r3, #135
- ble 4f
-
@ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
.endr
-
- bl idct16
- sub r2, r2, r12, lsl #4
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
.endr
- bl idct16_quarter
- sub r2, r2, r12, lsl #2
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
.endr
- bl idct16_half
- sub r2, r2, r12, lsl #3
+.endif
+
+ bl idct16\suffix
-5:
@ Do four 4x4 transposes. Originally, d16-d31 contain the
@ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
@ contain the transposed 4x4 blocks.
@@ -1274,39 +1425,42 @@ function idct32_1d_4x32_pass1_neon
sub r0, r0, #256
.purgem store_rev
- @ Move r2 to the first odd row
+ @ Move r2 back to the start of the input, and move
+ @ to the first odd row
+.ifb \suffix
+ sub r2, r2, r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+ sub r2, r2, r12, lsl #2
+.endif
+.ifc \suffix,_half
+ sub r2, r2, r12, lsl #3
+.endif
add r2, r2, #64
vmov.s16 d4, #0
-
- cmp r3, #34
- ble 3f
- cmp r3, #135
- ble 4f
-
@ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
.endr
-
- bl idct32_odd
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
.endr
- bl idct32_odd_quarter
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
.endr
- bl idct32_odd_half
+.endif
+
+ bl idct32_odd\suffix
-5:
transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30,
d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
@ Store the registers a, b, c, d horizontally,
@@ -1350,42 +1504,34 @@ endfunc
@ r0 = dst
@ r1 = dst stride
@ r2 = src (temp buffer)
-function idct32_1d_4x32_pass2_neon
+function idct32_1d_4x32_pass2\suffix\()_neon
push {lr}
movrel r12, idct_coeffs
vld1.16 {q0-q1}, [r12,:128]
mov r12, #128
-
- cmp r3, #34
- ble 3f
- cmp r3, #135
- ble 4f
-
@ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64], r12
.endr
sub r2, r2, r12, lsl #4
-
- bl idct16
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
vld1.16 {d\i}, [r2,:64], r12
.endr
sub r2, r2, r12, lsl #2
- bl idct16_quarter
- b 5f
-
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
vld1.16 {d\i}, [r2,:64], r12
.endr
sub r2, r2, r12, lsl #3
- bl idct16_half
+.endif
+
+ bl idct16\suffix
-5:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vst1.16 {d\i}, [r2,:64], r12
.endr
@@ -1393,36 +1539,29 @@ function idct32_1d_4x32_pass2_neon
sub r2, r2, r12, lsl #4
add r2, r2, #64
- cmp r3, #34
- ble 3f
- cmp r3, #135
- ble 4f
-
@ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64], r12
.endr
sub r2, r2, r12, lsl #4
-
- bl idct32_odd
- b 5f
-
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
vld1.16 {d\i}, [r2,:64], r12
.endr
sub r2, r2, r12, lsl #2
- bl idct32_odd_quarter
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
vld1.16 {d\i}, [r2,:64], r12
.endr
sub r2, r2, r12, lsl #3
- bl idct32_odd_half
-
-5:
+.endif
sub r2, r2, #64
+
+ bl idct32_odd\suffix
+
mov r12, #128
.macro load_acc_store a, b, c, d, neg=0
vld1.16 {d4}, [r2,:64], r12
@@ -1471,6 +1610,11 @@ function idct32_1d_4x32_pass2_neon
.purgem load_acc_store
pop {pc}
endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
const min_eob_idct_idct_32, align=4
.short 0, 9, 34, 70, 135, 240, 336, 448
@@ -1479,6 +1623,10 @@ endconst
function ff_vp9_idct_idct_32x32_add_neon, export=1
cmp r3, #1
beq idct32x32_dc_add_neon
+ cmp r3, #34
+ ble idct32x32_quarter_add_neon
+ cmp r3, #135
+ ble idct32x32_half_add_neon
push {r4-r8,lr}
vpush {q4-q7}
movrel r8, min_eob_idct_idct_32
@@ -1511,3 +1659,71 @@ A and r7, sp, #15
vpop {q4-q7}
pop {r4-r8,pc}
endfunc
+
+function idct32x32_quarter_add_neon
+ push {r4-r8,lr}
+ vpush {q4-q7}
+ movrel r8, min_eob_idct_idct_32
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #2048
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+.irp i, 0, 4
+ add r0, sp, #(\i*64)
+ ldrh r1, [r8, #(\i/2)]
+ add r2, r6, #(\i*2)
+ bl idct32_1d_4x32_pass1_quarter_neon
+.endr
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ bl idct32_1d_4x32_pass2_quarter_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q7}
+ pop {r4-r8,pc}
+endfunc
+
+function idct32x32_half_add_neon
+ push {r4-r8,lr}
+ vpush {q4-q7}
+ movrel r8, min_eob_idct_idct_32
+
+ @ Align the stack, allocate a temp buffer
+T mov r7, sp
+T and r7, r7, #15
+A and r7, sp, #15
+ add r7, r7, #2048
+ sub sp, sp, r7
+
+ mov r4, r0
+ mov r5, r1
+ mov r6, r2
+
+.irp i, 0, 4, 8, 12
+ add r0, sp, #(\i*64)
+ ldrh r1, [r8, #(\i/2)]
+ add r2, r6, #(\i*2)
+ bl idct32_1d_4x32_pass1_half_neon
+.endr
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add r0, r4, #(\i)
+ mov r1, r5
+ add r2, sp, #(\i*2)
+ bl idct32_1d_4x32_pass2_half_neon
+.endr
+
+ add sp, sp, r7
+ vpop {q4-q7}
+ pop {r4-r8,pc}
+endfunc
--
2.7.4
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel