The branch, master has been updated via 4fbacb39440f9951904b2ebc5ad76bf8019d478a (commit) from de25cb4603f938aafc1a182b76a6216de170e7ac (commit)
- Log ----------------------------------------------------------------- commit 4fbacb39440f9951904b2ebc5ad76bf8019d478a Author: Georgii Zagoruiko <george.zag...@gmail.com> AuthorDate: Tue Sep 9 22:10:54 2025 +0100 Commit: Georgii Zagoruiko <george.zag...@gmail.com> CommitDate: Tue Sep 9 22:13:04 2025 +0100 avcodec/aarch64/vvc: Optimised version of classify function. Macbook Air (M2): vvc_alf_classify_8x8_8_c: 2.6 ( 1.00x) vvc_alf_classify_8x8_8_neon: 1.0 ( 2.47x) vvc_alf_classify_8x8_10_c: 2.7 ( 1.00x) vvc_alf_classify_8x8_10_neon: 0.9 ( 2.98x) vvc_alf_classify_8x8_12_c: 2.7 ( 1.00x) vvc_alf_classify_8x8_12_neon: 0.9 ( 2.97x) vvc_alf_classify_16x16_8_c: 7.3 ( 1.00x) vvc_alf_classify_16x16_8_neon: 3.4 ( 2.12x) vvc_alf_classify_16x16_10_c: 4.3 ( 1.00x) vvc_alf_classify_16x16_10_neon: 2.9 ( 1.47x) vvc_alf_classify_16x16_12_c: 4.3 ( 1.00x) vvc_alf_classify_16x16_12_neon: 3.0 ( 1.44x) vvc_alf_classify_32x32_8_c: 13.7 ( 1.00x) vvc_alf_classify_32x32_8_neon: 10.7 ( 1.29x) vvc_alf_classify_32x32_10_c: 12.3 ( 1.00x) vvc_alf_classify_32x32_10_neon: 8.7 ( 1.42x) vvc_alf_classify_32x32_12_c: 12.2 ( 1.00x) vvc_alf_classify_32x32_12_neon: 8.7 ( 1.40x) vvc_alf_classify_64x64_8_c: 45.8 ( 1.00x) vvc_alf_classify_64x64_8_neon: 37.1 ( 1.23x) vvc_alf_classify_64x64_10_c: 41.3 ( 1.00x) vvc_alf_classify_64x64_10_neon: 32.8 ( 1.26x) vvc_alf_classify_64x64_12_c: 41.4 ( 1.00x) vvc_alf_classify_64x64_12_neon: 32.4 ( 1.28x) vvc_alf_classify_128x128_8_c: 163.7 ( 1.00x) vvc_alf_classify_128x128_8_neon: 138.3 ( 1.18x) vvc_alf_classify_128x128_10_c: 149.1 ( 1.00x) vvc_alf_classify_128x128_10_neon: 120.3 ( 1.24x) vvc_alf_classify_128x128_12_c: 148.7 ( 1.00x) vvc_alf_classify_128x128_12_neon: 119.4 ( 1.25x) RPi4 (Cortex-A72): vvc_alf_classify_8x8_8_c: 1251.6 ( 1.00x) vvc_alf_classify_8x8_8_neon: 700.7 ( 1.79x) vvc_alf_classify_8x8_10_c: 1141.9 ( 1.00x) vvc_alf_classify_8x8_10_neon: 659.7 ( 1.73x) vvc_alf_classify_8x8_12_c: 1075.8 ( 1.00x) vvc_alf_classify_8x8_12_neon: 658.7 ( 1.63x) 
vvc_alf_classify_16x16_8_c: 3574.1 ( 1.00x) vvc_alf_classify_16x16_8_neon: 1849.8 ( 1.93x) vvc_alf_classify_16x16_10_c: 3270.0 ( 1.00x) vvc_alf_classify_16x16_10_neon: 1786.1 ( 1.83x) vvc_alf_classify_16x16_12_c: 3271.7 ( 1.00x) vvc_alf_classify_16x16_12_neon: 1785.5 ( 1.83x) vvc_alf_classify_32x32_8_c: 12451.9 ( 1.00x) vvc_alf_classify_32x32_8_neon: 5984.3 ( 2.08x) vvc_alf_classify_32x32_10_c: 11428.9 ( 1.00x) vvc_alf_classify_32x32_10_neon: 5756.3 ( 1.99x) vvc_alf_classify_32x32_12_c: 11252.8 ( 1.00x) vvc_alf_classify_32x32_12_neon: 5755.7 ( 1.96x) vvc_alf_classify_64x64_8_c: 47625.5 ( 1.00x) vvc_alf_classify_64x64_8_neon: 21071.9 ( 2.26x) vvc_alf_classify_64x64_10_c: 44576.3 ( 1.00x) vvc_alf_classify_64x64_10_neon: 21544.7 ( 2.07x) vvc_alf_classify_64x64_12_c: 44600.5 ( 1.00x) vvc_alf_classify_64x64_12_neon: 21491.2 ( 2.08x) vvc_alf_classify_128x128_8_c: 192143.3 ( 1.00x) vvc_alf_classify_128x128_8_neon: 82387.6 ( 2.33x) vvc_alf_classify_128x128_10_c: 177583.1 ( 1.00x) vvc_alf_classify_128x128_10_neon: 81628.8 ( 2.18x) vvc_alf_classify_128x128_12_c: 177582.2 ( 1.00x) vvc_alf_classify_128x128_12_neon: 81625.1 ( 2.18x) diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S index 8801b3afb6..bd8317aba0 100644 --- a/libavcodec/aarch64/vvc/alf.S +++ b/libavcodec/aarch64/vvc/alf.S @@ -291,3 +291,208 @@ function ff_alf_filter_chroma_kernel_10_neon, export=1 1: alf_filter_chroma_kernel 2 endfunc + +#define ALF_BLOCK_SIZE 4 +#define ALF_GRADIENT_STEP 2 +#define ALF_GRADIENT_BORDER 2 +#define ALF_NUM_DIR 4 +#define ALF_GRAD_BORDER_X2 (ALF_GRADIENT_BORDER * 2) +#define ALF_STRIDE_MUL (ALF_GRADIENT_BORDER + 1) +#define ALF_GRAD_X_VSTEP (ALF_GRADIENT_STEP * 8) +#define ALF_GSTRIDE_MUL (ALF_NUM_DIR / ALF_GRADIENT_STEP) + +// Shift right: equal to division by 2 (see ALF_GRADIENT_STEP) +#define ALF_GSTRIDE_XG_BYTES (2 * ALF_NUM_DIR / ALF_GRADIENT_STEP) + +#define ALF_GSTRIDE_SUB_BYTES (2 * ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) * 
ALF_NUM_DIR) + +#define ALF_CLASS_INC (ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP) +#define ALF_CLASS_END ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) + +.macro ff_alf_classify_grad pix_size + // class_idx .req x0 + // transpose_idx .req x1 + // _src .req x2 + // _src_stride .req x3 + // width .req w4 + // height .req w5 + // vb_pos .req w6 + // gradient_tmp .req x7 + + mov w16, #ALF_STRIDE_MUL + add w5, w5, #ALF_GRAD_BORDER_X2 // h = height + ALF_GRAD_BORDER_X2 + mul x16, x3, x16 // ALF_STRIDE_MUL * stride + add w4, w4, #ALF_GRAD_BORDER_X2 // w = width + ALF_GRAD_BORDER_X2 + sub x15, x2, x16 // src -= (ALF_STRIDE_MUL * stride) + mov x17, x7 + .if \pix_size == 1 + sub x15, x15, #ALF_GRADIENT_BORDER + .else + sub x15, x15, #ALF_GRAD_BORDER_X2 + .endif + mov w8, #0 // y loop: y = 0 +1: + add x16, x8, #1 + mul x16, x16, x3 + madd x10, x8, x3, x15 // s0 = src + y * stride + add x14, x16, x3 + add x11, x15, x16 // s1 + add x16, x14, x3 + add x12, x15, x14 // s2 + add x13, x15, x16 // s3 + + // if (y == vb_pos): s3 = s2 + cmp w8, w6 + add w16, w6, #ALF_GRADIENT_BORDER + csel x13, x12, x13, eq + // if (y == vb_pos + 2): s0 = s1 + cmp w8, w16 + csel x10, x11, x10, eq + + .if \pix_size == 1 + sub x10, x10, #1 // s0-1 + sub x11, x11, #2 + sub x12, x12, #2 + .else + sub x10, x10, #2 // s0-1 + sub x11, x11, #4 + sub x12, x12, #4 + .endif + + // x loop + mov w9, #0 + b 11f +2: + // Store operation starts from the second cycle + st2 {v4.8h, v5.8h}, [x17], #32 +11: + .if \pix_size == 1 + // Load 8 pixels: s0 & s1+2 + mov x16, #1 + mov x14, #7 + ld1 {v0.8b}, [x10], x16 // s0-1 + ld1 {v2.8b}, [x13], x16 // s3 + ld1 {v1.8b}, [x10], x14 // s0 + ld1 {v3.8b}, [x13], x14 // s3+1 + uxtl v16.8h, v0.8b + uxtl v20.8h, v1.8b + uxtl v28.8h, v2.8b + uxtl v19.8h, v3.8b + + mov x16, #2 + mov x14, #4 + ld1 {v0.8b}, [x11], x16 // s1-2 + ld1 {v3.8b}, [x12], x16 // s2-2 + ld1 {v1.8b}, [x11], x16 // s1 + ld1 {v4.8b}, [x12], x16 // s2 + ld1 {v2.8b}, [x11], x14 // s1+2 + ld1 {v5.8b}, 
[x12], x14 // s2+2 + uxtl v17.8h, v0.8b + uxtl v22.8h, v1.8b + uxtl v26.8h, v2.8b + uxtl v18.8h, v3.8b + uxtl v24.8h, v4.8b + uxtl v27.8h, v5.8b + .else + mov x16, #2 + mov x14, #14 + ld1 {v16.8h}, [x10], x16 // s0-1 + ld1 {v28.8h}, [x13], x16 // s3 + ld1 {v20.8h}, [x10], x14 // s0 + ld1 {v19.8h}, [x13], x14 // s3+1 + + mov x16, #4 + mov x14, #8 + ld1 {v17.8h}, [x11], x16 // s1-2 + ld1 {v18.8h}, [x12], x16 // s2-2 + ld1 {v22.8h}, [x11], x16 // s1 + ld1 {v24.8h}, [x12], x16 // s2 + ld1 {v26.8h}, [x11], x14 // s1+2 + ld1 {v27.8h}, [x12], x14 // s2+2 + .endif + + // Grad: Vertical & D0 (interleaved) + trn1 v21.8h, v20.8h, v16.8h // first abs: operand 1 + rev32 v23.8h, v22.8h // second abs: operand 1 + trn2 v29.8h, v28.8h, v19.8h // second abs: operand 2 + trn1 v30.8h, v22.8h, v22.8h + trn2 v31.8h, v24.8h, v24.8h + add v30.8h, v30.8h, v30.8h + add v31.8h, v31.8h, v31.8h + sub v0.8h, v30.8h, v21.8h + sub v1.8h, v31.8h, v23.8h + sabd v4.8h, v0.8h, v24.8h + + // Grad: Horizontal & D1 (interleaved) + trn2 v21.8h, v17.8h, v20.8h // first abs: operand 1 + saba v4.8h, v1.8h, v29.8h + trn2 v23.8h, v22.8h, v18.8h // first abs: operand 2 + trn1 v25.8h, v24.8h, v26.8h // second abs: operand 1 + trn1 v29.8h, v27.8h, v28.8h // second abs: operand 2 + sub v0.8h, v30.8h, v21.8h + sub v1.8h, v31.8h, v25.8h + add w9, w9, #8 // x += 8 + sabd v5.8h, v0.8h, v23.8h + cmp w9, w4 + saba v5.8h, v1.8h, v29.8h + b.lt 2b + + add w8, w8, #ALF_GRADIENT_STEP // y += ALF_GRADIENT_STEP + // 8 pixels -> 4 cycles of generic + // 4 pixels -> paddings => half needs to be saved + st2 {v4.4h, v5.4h}, [x17], #16 + cmp w8, w5 + b.lt 1b + ret +.endm + +.macro ff_alf_classify_sum + ld1 {v0.8h, v1.8h, v2.8h}, [x2], x3 + uaddw v16.4s, v16.4s, v0.4h + uaddw v17.4s, v17.4s, v1.4h + uaddw v18.4s, v18.4s, v2.4h + uaddw2 v16.4s, v16.4s, v0.8h + uaddw2 v17.4s, v17.4s, v1.8h + uaddw2 v18.4s, v18.4s, v2.8h +.endm + +function ff_alf_classify_sum_neon, export=1 + // sum0 .req x0 + // sum1 .req x1 + // grad .req x2 + // 
gshift .req w3 + // steps .req w4 + lsl w3, w3, #1 + cmp w4, #4 + add w3, w3, #32 + + ld1 {v0.8h, v1.8h, v2.8h}, [x2], x3 + uxtl v16.4s, v0.4h + uxtl v17.4s, v1.4h + uxtl v18.4s, v2.4h + uaddw2 v16.4s, v16.4s, v0.8h + uaddw2 v17.4s, v17.4s, v1.8h + uaddw2 v18.4s, v18.4s, v2.8h + ff_alf_classify_sum + ff_alf_classify_sum + + blt 60f + ff_alf_classify_sum +60: + add v16.4s, v16.4s, v17.4s + add v18.4s, v18.4s, v17.4s + st1 {v16.4s}, [x0] + st1 {v18.4s}, [x1] + ret +endfunc + +function ff_alf_classify_grad_8_neon, export=1 + ff_alf_classify_grad 1 +endfunc + +function ff_alf_classify_grad_10_neon, export=1 + ff_alf_classify_grad 2 +endfunc + +function ff_alf_classify_grad_12_neon, export=1 + ff_alf_classify_grad 2 +endfunc diff --git a/libavcodec/aarch64/vvc/alf_template.c b/libavcodec/aarch64/vvc/alf_template.c index 41f7bf8995..364bd9cded 100644 --- a/libavcodec/aarch64/vvc/alf_template.c +++ b/libavcodec/aarch64/vvc/alf_template.c @@ -155,3 +155,89 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst, } } } + +#define ALF_DIR_VERT 0 +#define ALF_DIR_HORZ 1 +#define ALF_DIR_DIGA0 2 +#define ALF_DIR_DIGA1 3 + +static void FUNC(alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac) +{ + static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 }; + + int hv0, hv1, dir_hv, d0, d1, dir_d, hvd1, hvd0, sum_hv, dir1; + + dir_hv = sum[ALF_DIR_VERT] <= sum[ALF_DIR_HORZ]; + hv1 = FFMAX(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]); + hv0 = FFMIN(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]); + + dir_d = sum[ALF_DIR_DIGA0] <= sum[ALF_DIR_DIGA1]; + d1 = FFMAX(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]); + d0 = FFMIN(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]); + + //promote to avoid overflow + dir1 = (uint64_t)d1 * hv0 <= (uint64_t)hv1 * d0; + hvd1 = dir1 ? hv1 : d1; + hvd0 = dir1 ?
hv0 : d0; + + sum_hv = sum[ALF_DIR_HORZ] + sum[ALF_DIR_VERT]; + *class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)]; + if (hvd1 * 2 > 9 * hvd0) + *class_idx += ((dir1 << 1) + 2) * 5; + else if (hvd1 > 2 * hvd0) + *class_idx += ((dir1 << 1) + 1) * 5; + + *transpose_idx = dir_d * 2 + dir_hv; +} + +static void FUNC(alf_classify)(int *class_idx, int *transpose_idx, + const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height, + const int vb_pos, int16_t *gradient_tmp) +{ + int16_t *grad; + + const int w = width + ALF_GRADIENT_BORDER * 2; + const int size = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP; + const int gstride = (w / ALF_GRADIENT_STEP) * ALF_NUM_DIR; + const int gshift = gstride - size * ALF_NUM_DIR; + + for (int y = 0; y < height ; y += ALF_BLOCK_SIZE ) { + int start = 0; + int end = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP; + int ac = 2; + if (y + ALF_BLOCK_SIZE == vb_pos) { + end -= ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP; + ac = 3; + } else if (y == vb_pos) { + start += ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP; + ac = 3; + } + for (int x = 0; x < width; x += (2*ALF_BLOCK_SIZE)) { + const int xg = x / ALF_GRADIENT_STEP; + const int yg = y / ALF_GRADIENT_STEP; + int sum0[ALF_NUM_DIR]; + int sum1[ALF_NUM_DIR]; + grad = gradient_tmp + (yg + start) * gstride + xg * ALF_NUM_DIR; + ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start); + FUNC(alf_get_idx)(class_idx, transpose_idx, sum0, ac); + class_idx++; + transpose_idx++; + FUNC(alf_get_idx)(class_idx, transpose_idx, sum1, ac); + class_idx++; + transpose_idx++; + } + } + +} + +void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx, + const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height, + const int vb_pos, int16_t *gradient_tmp); + +static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx, + const uint8_t *_src, const ptrdiff_t 
_src_stride, const int width, const int height, + const int vb_pos, int *gradient_tmp) +{ + FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp); + FUNC(alf_classify)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp); +} diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c index 08204063f9..bdfa142a5a 100644 --- a/libavcodec/aarch64/vvc/dsp_init.c +++ b/libavcodec/aarch64/vvc/dsp_init.c @@ -30,6 +30,9 @@ #define BDOF_BLOCK_SIZE 16 #define BDOF_MIN_BLOCK_SIZE 4 + +void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps); + #define BIT_DEPTH 8 #include "alf_template.c" #undef BIT_DEPTH @@ -203,6 +206,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon; c->alf.filter[LUMA] = alf_filter_luma_8_neon; c->alf.filter[CHROMA] = alf_filter_chroma_8_neon; + c->alf.classify = alf_classify_8_neon; if (have_i8mm(cpu_flags)) { c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon_i8mm; @@ -242,6 +246,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->alf.filter[LUMA] = alf_filter_luma_10_neon; c->alf.filter[CHROMA] = alf_filter_chroma_10_neon; + c->alf.classify = alf_classify_10_neon; } else if (bd == 12) { c->inter.avg = ff_vvc_avg_12_neon; c->inter.w_avg = vvc_w_avg_12; @@ -252,6 +257,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->alf.filter[LUMA] = alf_filter_luma_12_neon; c->alf.filter[CHROMA] = alf_filter_chroma_12_neon; + c->alf.classify = alf_classify_12_neon; } c->inter.sad = ff_vvc_sad_neon; ----------------------------------------------------------------------- Summary of changes: libavcodec/aarch64/vvc/alf.S | 205 ++++++++++++++++++++++++++++++++++ libavcodec/aarch64/vvc/alf_template.c | 86 ++++++++++++++ libavcodec/aarch64/vvc/dsp_init.c | 6 + 
3 files changed, 297 insertions(+) hooks/post-receive -- _______________________________________________ ffmpeg-cvslog mailing list -- ffmpeg-cvslog@ffmpeg.org To unsubscribe send an email to ffmpeg-cvslog-le...@ffmpeg.org