This work is sponsored by, and copyright, Google.
Using separate functions for the half/quarter cases avoids having to fill
the temp buffer with zeros for the skipped slices, and leads to slightly
more straightforward code for these cases (for the 16x16 case, the special
case pass functions are written out explicitly instead of being templated
from the same macro), instead of riddling the common code with special case
branches or macro .ifs.
The code size increases from 18548 bytes to 24580 bytes.
Before:
vp9_inv_dct_dct_16x16_sub1_add_neon: 236.7
vp9_inv_dct_dct_16x16_sub4_add_neon: 714.2
vp9_inv_dct_dct_16x16_sub8_add_neon: 926.8
vp9_inv_dct_dct_16x16_sub12_add_neon: 1402.3
vp9_inv_dct_dct_16x16_sub16_add_neon: 1405.9
vp9_inv_dct_dct_32x32_sub1_add_neon: 554.1
vp9_inv_dct_dct_32x32_sub4_add_neon: 3958.8
vp9_inv_dct_dct_32x32_sub8_add_neon: 3958.8
vp9_inv_dct_dct_32x32_sub12_add_neon: 5461.1
vp9_inv_dct_dct_32x32_sub16_add_neon: 5467.4
vp9_inv_dct_dct_32x32_sub20_add_neon: 7175.4
vp9_inv_dct_dct_32x32_sub24_add_neon: 7172.5
vp9_inv_dct_dct_32x32_sub28_add_neon: 8136.8
vp9_inv_dct_dct_32x32_sub32_add_neon: 8135.9
After:
vp9_inv_dct_dct_16x16_sub1_add_neon: 236.7
vp9_inv_dct_dct_16x16_sub4_add_neon: 644.0
vp9_inv_dct_dct_16x16_sub8_add_neon: 854.0
vp9_inv_dct_dct_16x16_sub12_add_neon: 1393.8
vp9_inv_dct_dct_16x16_sub16_add_neon: 1392.6
vp9_inv_dct_dct_32x32_sub1_add_neon: 556.6
vp9_inv_dct_dct_32x32_sub4_add_neon: 3684.3
vp9_inv_dct_dct_32x32_sub8_add_neon: 3682.6
vp9_inv_dct_dct_32x32_sub12_add_neon: 5316.3
vp9_inv_dct_dct_32x32_sub16_add_neon: 5315.9
vp9_inv_dct_dct_32x32_sub20_add_neon: 7146.4
vp9_inv_dct_dct_32x32_sub24_add_neon: 7151.5
vp9_inv_dct_dct_32x32_sub28_add_neon: 8118.8
vp9_inv_dct_dct_32x32_sub32_add_neon: 8117.5
---
This reverts parts of the previous commit (changing some register uses to
another register); if both are to be applied, they should be applied
squashed together. (And similarly for review, it's much easier to squash the
two and review the end result.) They are presented sequentially as two steps,
to show the effect on runtime and code size of each alternative.
---
libavcodec/aarch64/vp9itxfm_neon.S | 737 +++++++++++++++++++++++--------------
1 file changed, 458 insertions(+), 279 deletions(-)
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index d74245f..78041d3 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -710,6 +710,51 @@ endfunc
st1 {v2.8h}, [\src], \inc
.endm
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7,
tmp1, tmp2
+ srshr \coef0, \coef0, #6 // Rounding shift right by 6 of each 16-bit coefficient row
+ ld1 {v2.8b}, [x0], x1 // Even output rows are read via x0, post-incremented by x1
+ srshr \coef1, \coef1, #6
+ ld1 {v3.8b}, [x3], x1 // Odd output rows are read via x3 (callers set x3 = x0 + stride, x1 = 2*stride)
+ srshr \coef2, \coef2, #6
+ ld1 {v4.8b}, [x0], x1
+ srshr \coef3, \coef3, #6
+ uaddw \coef0, \coef0, v2.8b // Widen the 8 loaded bytes and add them to the coefficients
+ ld1 {v5.8b}, [x3], x1
+ uaddw \coef1, \coef1, v3.8b
+ srshr \coef4, \coef4, #6
+ ld1 {v6.8b}, [x0], x1
+ srshr \coef5, \coef5, #6
+ ld1 {v7.8b}, [x3], x1
+ sqxtun v2.8b, \coef0 // Narrow back to bytes with unsigned saturation
+ srshr \coef6, \coef6, #6
+ sqxtun v3.8b, \coef1
+ srshr \coef7, \coef7, #6
+ uaddw \coef2, \coef2, v4.8b
+ ld1 {\tmp1}, [x0], x1 // coef0 was narrowed into v2 above, so tmp1 may alias it (as the callers in this patch do)
+ uaddw \coef3, \coef3, v5.8b
+ ld1 {\tmp2}, [x3], x1 // Likewise tmp2 may alias coef1
+ sqxtun v4.8b, \coef2
+ sub x0, x0, x1, lsl #2 // Rewind both pointers by the four rows just loaded,
+ sub x3, x3, x1, lsl #2 // so that the stores below write back to the same rows
+ sqxtun v5.8b, \coef3
+ uaddw \coef4, \coef4, v6.8b
+ st1 {v2.8b}, [x0], x1 // Stores are interleaved with the remaining arithmetic
+ uaddw \coef5, \coef5, v7.8b
+ st1 {v3.8b}, [x3], x1
+ sqxtun v6.8b, \coef4
+ st1 {v4.8b}, [x0], x1
+ sqxtun v7.8b, \coef5
+ st1 {v5.8b}, [x3], x1
+ uaddw \coef6, \coef6, \tmp1
+ st1 {v6.8b}, [x0], x1
+ uaddw \coef7, \coef7, \tmp2
+ st1 {v7.8b}, [x3], x1
+ sqxtun \tmp1, \coef6
+ sqxtun \tmp2, \coef7
+ st1 {\tmp1}, [x0], x1 // On exit x0 and x3 have each advanced by four steps of x1;
+ st1 {\tmp2}, [x3], x1 // v2-v7, tmp1, tmp2 and all coef registers are overwritten
+.endm
+
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
@@ -728,37 +773,12 @@ function \txfm\()16_1d_8x16_pass1_neon
mov x9, #32
movi v2.8h, #0
-
-.ifc \txfm,idct
- cmp w3, #10
- b.le 3f
- cmp w3, #38
- b.le 4f
-.endif
-
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load_clear \i, x2, x9
.endr
bl \txfm\()16
-.ifc \txfm,idct
- b 5f
-3:
-.irp i, 16, 17, 18, 19
- load_clear \i, x2, x9
-.endr
- bl idct16_quarter
- b 5f
-
-4:
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- load_clear \i, x2, x9
-.endr
- bl idct16_half
-.endif
-
-5:
// Do two 8x8 transposes. Originally, v16-v31 contain the
// 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
// transposed 8x8 blocks.
@@ -812,92 +832,25 @@ endfunc
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
-// w3 = eob
-// x13 = slice offset
+// x3 = slice offset
function \txfm\()16_1d_8x16_pass2_neon
mov x14, x30
mov x9, #32
-.ifc \txfm,idct
- cmp w3, #10
- b.le 3f
- cmp w3, #38
- b.le 4f
-.endif
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load \i, x2, x9
.endr
- cbz x13, 1f
+ cbz x3, 1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
load \i, x2, x9
.endr
1:
+ add x3, x0, x1
+ lsl x1, x1, #1
bl \txfm\()16
-.ifc \txfm,idct
- b 5f
-3:
-.irp i, 16, 17, 18, 19
- load \i, x2, x9
-.endr
- bl idct16_quarter
- b 5f
-4:
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- load \i, x2, x9
-.endr
- bl idct16_half
-.endif
-5:
- add x8, x0, x1
- lsl x1, x1, #1
-.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7,
tmp1, tmp2
- srshr \coef0, \coef0, #6
- ld1 {v2.8b}, [x0], x1
- srshr \coef1, \coef1, #6
- ld1 {v3.8b}, [x8], x1
- srshr \coef2, \coef2, #6
- ld1 {v4.8b}, [x0], x1
- srshr \coef3, \coef3, #6
- uaddw \coef0, \coef0, v2.8b
- ld1 {v5.8b}, [x8], x1
- uaddw \coef1, \coef1, v3.8b
- srshr \coef4, \coef4, #6
- ld1 {v6.8b}, [x0], x1
- srshr \coef5, \coef5, #6
- ld1 {v7.8b}, [x8], x1
- sqxtun v2.8b, \coef0
- srshr \coef6, \coef6, #6
- sqxtun v3.8b, \coef1
- srshr \coef7, \coef7, #6
- uaddw \coef2, \coef2, v4.8b
- ld1 {\tmp1}, [x0], x1
- uaddw \coef3, \coef3, v5.8b
- ld1 {\tmp2}, [x8], x1
- sqxtun v4.8b, \coef2
- sub x0, x0, x1, lsl #2
- sub x8, x8, x1, lsl #2
- sqxtun v5.8b, \coef3
- uaddw \coef4, \coef4, v6.8b
- st1 {v2.8b}, [x0], x1
- uaddw \coef5, \coef5, v7.8b
- st1 {v3.8b}, [x8], x1
- sqxtun v6.8b, \coef4
- st1 {v4.8b}, [x0], x1
- sqxtun v7.8b, \coef5
- st1 {v5.8b}, [x8], x1
- uaddw \coef6, \coef6, \tmp1
- st1 {v6.8b}, [x0], x1
- uaddw \coef7, \coef7, \tmp2
- st1 {v7.8b}, [x8], x1
- sqxtun \tmp1, \coef6
- sqxtun \tmp2, \coef7
- st1 {\tmp1}, [x0], x1
- st1 {\tmp2}, [x8], x1
-.endm
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h,
v22.8h, v23.8h, v16.8b, v17.8b
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h,
v30.8h, v31.8h, v16.8b, v17.8b
-.purgem load_add_store
br x14
endfunc
@@ -916,6 +869,10 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon,
export=1
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #1
b.eq idct16x16_dc_add_neon
+ cmp w3, #10 // w3 = eob; same threshold and b.le as the in-function check this patch removes
+ b.le idct16x16_quarter_add_neon
+ cmp w3, #38 // eob in 11..38 takes the half variant, as the 32x32 dispatcher does with b.le
+ b.le idct16x16_half_add_neon
.endif
mov x15, x30
// iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
@@ -936,7 +893,6 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifnc \txfm1\()_\txfm2,idct_idct
movrel x11, iadst16_coeffs
mov x7, #0
- mov w3, #256
.else
movrel x12, min_eob_idct_idct_16
.endif
@@ -960,7 +916,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
add x0, x4, #(\i)
mov x1, x5
add x2, sp, #(\i*2)
- mov x13, #\i
+ mov x3, #\i
bl \txfm2\()16_1d_8x16_pass2_neon
.endr
@@ -980,6 +936,163 @@ itxfm_func16x16 iadst, idct
itxfm_func16x16 idct, iadst
itxfm_func16x16 iadst, iadst
+function idct16_1d_8x16_pass1_quarter_neon
+ mov x14, x30 // Keep the return address in x14, since the bl below overwrites x30
+ mov x9, #32 // Input row stride in bytes (16 coefficients of 16 bits each)
+ movi v2.8h, #0 // Zero vector that load_clear writes back over each row it has read
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+
+ bl idct16_quarter
+
+ // Do two 8x8 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+ // transposed 8x8 blocks.
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ // Store the transposed 8x8 blocks horizontally.
+ // The first 8x8 block is kept in registers for the second pass,
+ // store the rest in the temp buffer.
+ // Since only a 4x4 part of the input was nonzero,
+ // this means that only 4 rows are nonzero after transposing, and
+ // the second pass only reads the topmost 4 rows. Therefore only
+ // store the topmost 4 rows.
+.irp i, 24, 25, 26, 27
+ add x0, x0, #16 // Skip the first 16 bytes of the row; with the 16 byte post-increment of the store, x0 advances 32 bytes per row
+ store \i, x0, #16
+.endr
+ br x14 // Return through the address saved on entry
+endfunc
+
+function idct16_1d_8x16_pass2_quarter_neon
+ mov x14, x30 // Keep the return address in x14, since the bl below overwrites x30
+ mov x9, #32 // Temp buffer row stride in bytes for the loads below
+ cbz x3, 1f // x3 = slice offset; for slice 0 the loads are skipped and v16-v19 are used as they are on entry
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1 // x3 = dst + stride, the second row pointer used by load_add_store
+ lsl x1, x1, #1 // Double the stride, as x0 and x3 each step over every other row
+ bl idct16_quarter
+
+ load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h,
v22.8h, v23.8h, v16.8b, v17.8b
+ load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h,
v30.8h, v31.8h, v16.8b, v17.8b
+
+ br x14 // Return through the address saved on entry
+endfunc
+
+function idct16_1d_8x16_pass1_half_neon
+ mov x14, x30 // Keep the return address in x14, since the bl below overwrites x30
+ mov x9, #32 // Input row stride in bytes (16 coefficients of 16 bits each)
+ movi v2.8h, #0 // Zero vector that load_clear writes back over each row it has read
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+
+ bl idct16_half
+
+ // Do two 8x8 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+ // transposed 8x8 blocks.
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ // Store the transposed 8x8 blocks horizontally.
+ // The first 8x8 block is kept in registers for the second pass,
+ // store the rest in the temp buffer.
+ // Since only an 8x8 part of the input was nonzero,
+ // this means that only 8 rows are nonzero after transposing, and
+ // the second pass only reads the topmost 8 rows. Therefore only
+ // store the topmost 8 rows.
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+ add x0, x0, #16 // Skip the first 16 bytes of the row; with the 16 byte post-increment of the store, x0 advances 32 bytes per row
+ store \i, x0, #16
+.endr
+ br x14 // Return through the address saved on entry
+endfunc
+
+function idct16_1d_8x16_pass2_half_neon
+ mov x14, x30 // Keep the return address in x14, since the bl below overwrites x30
+ mov x9, #32 // Temp buffer row stride in bytes for the loads below
+ cbz x3, 1f // x3 = slice offset; for slice 0 the loads are skipped and v16-v23 are used as they are on entry
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1 // x3 = dst + stride, the second row pointer used by load_add_store
+ lsl x1, x1, #1 // Double the stride, as x0 and x3 each step over every other row
+ bl idct16_half
+
+ load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h,
v22.8h, v23.8h, v16.8b, v17.8b
+ load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h,
v30.8h, v31.8h, v16.8b, v17.8b
+
+ br x14 // Return through the address saved on entry
+endfunc
+
+function idct16x16_quarter_add_neon
+ mov x15, x30 // Not exported: branched to by its bare name from the 16x16 entry point, like the idct32x32 quarter/half helpers
+
+ sub sp, sp, #512 // Reserve the 512 byte temp buffer (16x16 coefficients of 16 bits)
+
+ mov x4, x0 // x4 = dst
+ mov x5, x1 // x5 = dst stride
+ mov x6, x2 // x6 = input coefficients
+
+ movrel x10, idct_coeffs
+ ld1 {v0.8h,v1.8h}, [x10] // Load the first 16 halfwords of idct_coeffs into v0-v1
+
+.irp i, 0
+ add x0, sp, #(\i*32)
+ mov x1, #\i // NOTE(review): the quarter pass1 function in this patch never reads x1
+ add x2, x6, #(\i*2)
+ bl idct16_1d_8x16_pass1_quarter_neon
+.endr
+.irp i, 0, 8
+ add x0, x4, #(\i)
+ mov x1, x5
+ add x2, sp, #(\i*2)
+ mov x3, #\i // Slice offset; pass2 skips reloading from the temp buffer when it is 0
+ bl idct16_1d_8x16_pass2_quarter_neon
+.endr
+
+ add sp, sp, #512 // Release the temp buffer
+ br x15 // Return to the caller of the 16x16 entry point
+endfunc
+
+function idct16x16_half_add_neon
+ mov x15, x30 // Not exported: branched to by its bare name from the 16x16 entry point, like the idct32x32 quarter/half helpers
+
+ sub sp, sp, #512 // Reserve the 512 byte temp buffer (16x16 coefficients of 16 bits)
+
+ mov x4, x0 // x4 = dst
+ mov x5, x1 // x5 = dst stride
+ mov x6, x2 // x6 = input coefficients
+
+ movrel x10, idct_coeffs
+ ld1 {v0.8h,v1.8h}, [x10] // Load the first 16 halfwords of idct_coeffs into v0-v1
+
+.irp i, 0
+ add x0, sp, #(\i*32)
+ mov x1, #\i // NOTE(review): the half pass1 function in this patch never reads x1
+ add x2, x6, #(\i*2)
+ bl idct16_1d_8x16_pass1_half_neon
+.endr
+.irp i, 0, 8
+ add x0, x4, #(\i)
+ mov x1, x5
+ add x2, sp, #(\i*2)
+ mov x3, #\i // Slice offset; pass2 skips reloading from the temp buffer when it is 0
+ bl idct16_1d_8x16_pass2_half_neon
+.endr
+
+ add sp, sp, #512 // Release the temp buffer
+ br x15 // Return to the caller of the 16x16 entry point
+endfunc
function idct32x32_dc_add_neon
movrel x4, idct_coeffs
@@ -1160,6 +1273,85 @@ function idct32_odd_quarter
endfunc
+// Store the registers a, b horizontally via x0, followed by the same
+// registers b, a mirrored. Writes 64 bytes in total and overwrites v0, v1.
+.macro store_rev1 a, b
+ // There's no rev128 instruction, but we reverse each 64 bit
+ // half, and then flip them using an ext with 8 bytes offset.
+ rev64 v1.8h, v\b\().8h
+ st1 {v\a\().8h}, [x0], #16
+ rev64 v0.8h, v\a\().8h
+ ext v1.16b, v1.16b, v1.16b, #8 // v1 = b with its eight halfwords in reverse order
+ st1 {v\b\().8h}, [x0], #16
+ ext v0.16b, v0.16b, v0.16b, #8 // v0 = a with its eight halfwords in reverse order
+ st1 {v1.8h}, [x0], #16
+ st1 {v0.8h}, [x0], #16
+.endm
+
+// Accumulate the registers a, b into the 64 bytes of output at x0:
+// a and b are added to the first two vectors, and the mirrored b, a
+// are subtracted from the last two. Overwrites v0, v1 and v4-v7.
+.macro store_rev2 a, b
+ ld1 {v4.8h}, [x0]
+ rev64 v1.8h, v\b\().8h
+ add v4.8h, v4.8h, v\a\().8h
+ rev64 v0.8h, v\a\().8h
+ st1 {v4.8h}, [x0], #16
+ ext v1.16b, v1.16b, v1.16b, #8 // v1 = b with its eight halfwords in reverse order
+ ld1 {v5.8h}, [x0]
+ ext v0.16b, v0.16b, v0.16b, #8 // v0 = a with its eight halfwords in reverse order
+ add v5.8h, v5.8h, v\b\().8h
+ st1 {v5.8h}, [x0], #16
+ ld1 {v6.8h}, [x0]
+ sub v6.8h, v6.8h, v1.8h
+ st1 {v6.8h}, [x0], #16
+ ld1 {v7.8h}, [x0]
+ sub v7.8h, v7.8h, v0.8h
+ st1 {v7.8h}, [x0], #16
+.endm
+
+.macro load_acc_store a, b, c, d, neg=0
+ ld1 {v4.8h}, [x2], x9 // Load four rows of 16-bit values via x2, post-incremented by x9
+ ld1 {v5.8h}, [x2], x9
+.if \neg == 0
+ add v4.8h, v4.8h, v\a\().8h // neg == 0: add the registers a, b, c, d to the four loaded rows
+ ld1 {v6.8h}, [x2], x9
+ add v5.8h, v5.8h, v\b\().8h
+ ld1 {v7.8h}, [x2], x9
+ add v6.8h, v6.8h, v\c\().8h
+ add v7.8h, v7.8h, v\d\().8h
+.else
+ sub v4.8h, v4.8h, v\a\().8h // neg != 0: subtract the registers a, b, c, d instead
+ ld1 {v6.8h}, [x2], x9
+ sub v5.8h, v5.8h, v\b\().8h
+ ld1 {v7.8h}, [x2], x9
+ sub v6.8h, v6.8h, v\c\().8h
+ sub v7.8h, v7.8h, v\d\().8h
+.endif
+ ld1 {v0.8b}, [x0], x1 // Load four rows of 8 bytes via x0; this overwrites v0-v3
+ ld1 {v1.8b}, [x0], x1
+ srshr v4.8h, v4.8h, #6 // Rounding shift right by 6 of each 16-bit sum
+ ld1 {v2.8b}, [x0], x1
+ srshr v5.8h, v5.8h, #6
+ uaddw v4.8h, v4.8h, v0.8b // Widen the loaded bytes and add them to the shifted sums
+ ld1 {v3.8b}, [x0], x1
+ srshr v6.8h, v6.8h, #6
+ uaddw v5.8h, v5.8h, v1.8b
+ srshr v7.8h, v7.8h, #6
+ sub x0, x0, x1, lsl #2 // Rewind x0 by the four rows just loaded, so the stores hit the same rows
+ uaddw v6.8h, v6.8h, v2.8b
+ sqxtun v4.8b, v4.8h // Narrow back to bytes with unsigned saturation
+ uaddw v7.8h, v7.8h, v3.8b
+ sqxtun v5.8b, v5.8h
+ st1 {v4.8b}, [x0], x1
+ sqxtun v6.8b, v6.8h
+ st1 {v5.8b}, [x0], x1
+ sqxtun v7.8b, v7.8h
+ st1 {v6.8b}, [x0], x1
+ st1 {v7.8b}, [x0], x1 // On exit x0 has advanced by four rows and x2 by four steps of x9
+.endm
+
+.macro idct32_funcs suffix
// Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
@@ -1171,149 +1363,102 @@ endfunc
// w3 = eob
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
-function idct32_1d_8x32_pass1_neon
+function idct32_1d_8x32_pass1\suffix\()_neon
// Check if this whole input slice is zero
+.ifb \suffix
cmp w3, w1
b.le 1f
+.endif
mov x14, x30
ld1 {v0.8h,v1.8h}, [x10]
// Double stride of the input, since we only read every other line
mov x9, #128
- movi v4.8h, #0
-
- cmp w3, #4
- b.le 3f
- cmp w3, #135
- b.le 4f
+ movi v2.8h, #0
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
-
- bl idct16
- sub x2, x2, x9, lsl #4
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
- bl idct16_quarter
- sub x2, x2, x9, lsl #2
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
- bl idct16_half
- sub x2, x2, x9, lsl #3
+.endif
+
+ bl idct16\suffix
-5:
// Do two 8x8 transposes. Originally, v16-v31 contain the
// 16 rows. Afterwards, v16-v23 and v24-v31 contain the
// two transposed 8x8 blocks.
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
- // Store the registers a, b horizontally, followed by the
- // same registers b, a mirrored.
-.macro store_rev a, b
- // There's no rev128 instruction, but we reverse each 64 bit
- // half, and then flip them using an ext with 8 bytes offset.
- rev64 v1.8h, v\b\().8h
- st1 {v\a\().8h}, [x0], #16
- rev64 v0.8h, v\a\().8h
- ext v1.16b, v1.16b, v1.16b, #8
- st1 {v\b\().8h}, [x0], #16
- ext v0.16b, v0.16b, v0.16b, #8
- st1 {v1.8h}, [x0], #16
- st1 {v0.8h}, [x0], #16
-.endm
- store_rev 16, 24
- store_rev 17, 25
- store_rev 18, 26
- store_rev 19, 27
- store_rev 20, 28
- store_rev 21, 29
- store_rev 22, 30
- store_rev 23, 31
+ store_rev1 16, 24
+ store_rev1 17, 25
+ store_rev1 18, 26
+ store_rev1 19, 27
+ store_rev1 20, 28
+ store_rev1 21, 29
+ store_rev1 22, 30
+ store_rev1 23, 31
sub x0, x0, #512
-.purgem store_rev
- // Move x2 to the first odd row
+ // Move x2 back to the start of the input, and move
+ // to the first odd row
+.ifb \suffix
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+ sub x2, x2, x9, lsl #3
+.endif
add x2, x2, #64
- movi v4.8h, #0
-
- cmp w3, #34
- b.le 3f
- cmp w3, #135
- b.le 4f
-
+ movi v2.8h, #0
// v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
-
- bl idct32_odd
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
- bl idct32_odd_quarter
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
- bl idct32_odd_half
+.endif
+
+ bl idct32_odd\suffix
-5:
transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
- // Store the registers a, b horizontally,
- // adding into the output first, and the mirrored,
- // subtracted from the output.
-.macro store_rev a, b
- ld1 {v4.8h}, [x0]
- rev64 v1.8h, v\b\().8h
- add v4.8h, v4.8h, v\a\().8h
- rev64 v0.8h, v\a\().8h
- st1 {v4.8h}, [x0], #16
- ext v1.16b, v1.16b, v1.16b, #8
- ld1 {v5.8h}, [x0]
- ext v0.16b, v0.16b, v0.16b, #8
- add v5.8h, v5.8h, v\b\().8h
- st1 {v5.8h}, [x0], #16
- ld1 {v6.8h}, [x0]
- sub v6.8h, v6.8h, v1.8h
- st1 {v6.8h}, [x0], #16
- ld1 {v7.8h}, [x0]
- sub v7.8h, v7.8h, v0.8h
- st1 {v7.8h}, [x0], #16
-.endm
-
- store_rev 31, 23
- store_rev 30, 22
- store_rev 29, 21
- store_rev 28, 20
- store_rev 27, 19
- store_rev 26, 18
- store_rev 25, 17
- store_rev 24, 16
-.purgem store_rev
+ store_rev2 31, 23
+ store_rev2 30, 22
+ store_rev2 29, 21
+ store_rev2 28, 20
+ store_rev2 27, 19
+ store_rev2 26, 18
+ store_rev2 25, 17
+ store_rev2 24, 16
br x14
+.ifb \suffix
1:
// Write zeros to the temp buffer for pass 2
movi v16.8h, #0
@@ -1324,6 +1469,7 @@ function idct32_1d_8x32_pass1_neon
st1 {v16.8h-v19.8h}, [x0], #64
.endr
ret
+.endif
endfunc
// This is mostly the same as 8x32_pass1, but without the transpose,
@@ -1334,116 +1480,63 @@ endfunc
// x2 = src (temp buffer)
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
-function idct32_1d_8x32_pass2_neon
+function idct32_1d_8x32_pass2\suffix\()_neon
mov x14, x30
ld1 {v0.8h,v1.8h}, [x10]
mov x9, #128
-
- cmp w3, #34
- b.le 3f
- cmp w3, #135
- b.le 4f
-
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
-
- bl idct16
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #2
- bl idct16_quarter
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #3
- bl idct16_half
+.endif
+
+ bl idct16\suffix
-5:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- st1 {v\i\().8h}, [x2], x9
+ store \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
add x2, x2, #64
- cmp w3, #34
- b.le 3f
- cmp w3, #135
- b.le 4f
-
// v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
-
- bl idct32_odd
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #2
- bl idct32_odd_quarter
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #3
- bl idct32_odd_half
-
-5:
- sub x2, x2, #64
-.macro load_acc_store a, b, c, d, neg=0
- ld1 {v4.8h}, [x2], x9
- ld1 {v5.8h}, [x2], x9
-.if \neg == 0
- add v4.8h, v4.8h, v\a\().8h
- ld1 {v6.8h}, [x2], x9
- add v5.8h, v5.8h, v\b\().8h
- ld1 {v7.8h}, [x2], x9
- add v6.8h, v6.8h, v\c\().8h
- add v7.8h, v7.8h, v\d\().8h
-.else
- sub v4.8h, v4.8h, v\a\().8h
- ld1 {v6.8h}, [x2], x9
- sub v5.8h, v5.8h, v\b\().8h
- ld1 {v7.8h}, [x2], x9
- sub v6.8h, v6.8h, v\c\().8h
- sub v7.8h, v7.8h, v\d\().8h
.endif
- ld1 {v0.8b}, [x0], x1
- ld1 {v1.8b}, [x0], x1
- srshr v4.8h, v4.8h, #6
- ld1 {v2.8b}, [x0], x1
- srshr v5.8h, v5.8h, #6
- uaddw v4.8h, v4.8h, v0.8b
- ld1 {v3.8b}, [x0], x1
- srshr v6.8h, v6.8h, #6
- uaddw v5.8h, v5.8h, v1.8b
- srshr v7.8h, v7.8h, #6
- sub x0, x0, x1, lsl #2
- uaddw v6.8h, v6.8h, v2.8b
- sqxtun v4.8b, v4.8h
- uaddw v7.8h, v7.8h, v3.8b
- sqxtun v5.8b, v5.8h
- st1 {v4.8b}, [x0], x1
- sqxtun v6.8b, v6.8h
- st1 {v5.8b}, [x0], x1
- sqxtun v7.8b, v7.8h
- st1 {v6.8b}, [x0], x1
- st1 {v7.8b}, [x0], x1
-.endm
+ sub x2, x2, #64
+
+ bl idct32_odd\suffix
+
load_acc_store 31, 30, 29, 28
load_acc_store 27, 26, 25, 24
load_acc_store 23, 22, 21, 20
@@ -1454,9 +1547,13 @@ function idct32_1d_8x32_pass2_neon
load_acc_store 20, 21, 22, 23, 1
load_acc_store 24, 25, 26, 27, 1
load_acc_store 28, 29, 30, 31, 1
-.purgem load_acc_store
br x14
endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
const min_eob_idct_idct_32, align=4
.short 0, 34, 135, 336
@@ -1465,6 +1562,10 @@ endconst
function ff_vp9_idct_idct_32x32_add_neon, export=1
cmp w3, #1
b.eq idct32x32_dc_add_neon
+ cmp w3, #34
+ b.le idct32x32_quarter_add_neon
+ cmp w3, #135
+ b.le idct32x32_half_add_neon
movrel x10, idct_coeffs
add x11, x10, #32
@@ -1505,3 +1606,81 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
br x15
endfunc
+
+function idct32x32_quarter_add_neon
+ movrel x10, idct_coeffs // x10 = idct_coeffs, as the 8x32 pass functions expect
+ add x11, x10, #32 // x11 = idct_coeffs + 32
+
+ mov x15, x30 // Keep the return address in x15 across the bl calls below
+
+ stp d14, d15, [sp, #-0x10]! // Save d8-d15 on the stack; restored in reverse order before returning
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+
+ sub sp, sp, #2048 // Reserve the 2048 byte temp buffer (32x32 coefficients of 16 bits)
+
+ mov x4, x0 // x4 = dst
+ mov x5, x1 // x5 = dst stride
+ mov x6, x2 // x6 = input coefficients
+
+.irp i, 0
+ add x0, sp, #(\i*64)
+ add x2, x6, #(\i*2)
+ bl idct32_1d_8x32_pass1_quarter_neon
+.endr
+.irp i, 0, 8, 16, 24
+ add x0, x4, #(\i)
+ mov x1, x5 // Reload the stride for every slice
+ add x2, sp, #(\i*2)
+ bl idct32_1d_8x32_pass2_quarter_neon
+.endr
+
+ add sp, sp, #2048 // Release the temp buffer
+
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+
+ br x15 // Return to the caller of the 32x32 entry point
+endfunc
+
+function idct32x32_half_add_neon
+ movrel x10, idct_coeffs // x10 = idct_coeffs, as the 8x32 pass functions expect
+ add x11, x10, #32 // x11 = idct_coeffs + 32
+
+ mov x15, x30 // Keep the return address in x15 across the bl calls below
+
+ stp d14, d15, [sp, #-0x10]! // Save d8-d15 on the stack; restored in reverse order before returning
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+
+ sub sp, sp, #2048 // Reserve the 2048 byte temp buffer (32x32 coefficients of 16 bits)
+
+ mov x4, x0 // x4 = dst
+ mov x5, x1 // x5 = dst stride
+ mov x6, x2 // x6 = input coefficients
+
+.irp i, 0, 8
+ add x0, sp, #(\i*64)
+ add x2, x6, #(\i*2)
+ bl idct32_1d_8x32_pass1_half_neon
+.endr
+.irp i, 0, 8, 16, 24
+ add x0, x4, #(\i)
+ mov x1, x5 // Reload the stride for every slice
+ add x2, sp, #(\i*2)
+ bl idct32_1d_8x32_pass2_half_neon
+.endr
+
+ add sp, sp, #2048 // Release the temp buffer
+
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+
+ br x15 // Return to the caller of the 32x32 entry point
+endfunc
--
2.7.4
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel