Thank you, Martin! I will have a look at it over the next week!
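
Regarding the v30 initialisation below: I assume a single table load along
these lines is what you had in mind (untested sketch, using the const/endconst
and movrel helpers from libavutil/aarch64/asm.S; the label name is just a
placeholder):

const alf_class_arg_var, align=4
        .byte   0, 1, 2, 2, 2, 2, 2, 3
        .byte   3, 3, 3, 3, 3, 3, 3, 4
endconst

.macro alf_classify_argvar2v30
        movrel  x16, alf_class_arg_var          // address of the 16-byte pattern
        ld1     {v30.16b}, [x16]                // one load instead of 19 movs
.endm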

On Fri, Aug 1, 2025 at 6:50 PM Martin Storsjö <mar...@martin.st> wrote:
> On Wed, 9 Jul 2025, Georgii Zagoruiko wrote:
>
> > diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S
> > index 8801b3afb6..9c9765ead1 100644
> > --- a/libavcodec/aarch64/vvc/alf.S
> > +++ b/libavcodec/aarch64/vvc/alf.S
> > @@ -291,3 +291,281 @@ function ff_alf_filter_chroma_kernel_10_neon, export=1
> > 1:
> >         alf_filter_chroma_kernel 2
> > endfunc
> > +
> > +
> > +
> > +.macro alf_classify_argvar2v30
> > +        mov     w16, #0
> > +        mov     v30.b[0], w16
> > +        mov     w16, #1
> > +        mov     v30.b[1], w16
> > +        mov     w16, #2
> > +        mov     v30.b[2], w16
> > +        mov     v30.b[3], w16
> > +        mov     v30.b[4], w16
> > +        mov     v30.b[5], w16
> > +        mov     v30.b[6], w16
> > +        mov     w16, #3
> > +        mov     v30.b[7], w16
> > +        mov     v30.b[8], w16
> > +        mov     v30.b[9], w16
> > +        mov     v30.b[10], w16
> > +        mov     v30.b[11], w16
> > +        mov     v30.b[12], w16
> > +        mov     v30.b[13], w16
> > +        mov     v30.b[14], w16
> > +        mov     w16, #4
> > +        mov     v30.b[15], w16
>
> Wouldn't it be more efficient to just have this pattern stored as a
> constant somewhere, and do a single load instruction (with some latency)
> to load it, rather than a 19 instruction sequence to initialize it?
>
> > +.endm
> > +
> > +.macro alf_classify_load_pixel pix_size, dst, src
> > +    .if \pix_size == 1
> > +        ldrb    \dst, [\src], #1
> > +    .else
> > +        ldrh    \dst, [\src], #2
> > +    .endif
> > +.endm
> > +
> > +.macro alf_classify_load_pixel_with_offset pix_size, dst, src, offset
> > +    .if \pix_size == 1
> > +        ldrb    \dst, [\src, #(\offset)]
> > +    .else
> > +        ldrh    \dst, [\src, #(2*\offset)]
> > +    .endif
> > +.endm
> > +
> > +#define ALF_BLOCK_SIZE          4
> > +#define ALF_GRADIENT_STEP       2
> > +#define ALF_GRADIENT_BORDER     2
> > +#define ALF_NUM_DIR             4
> > +#define ALF_GRAD_BORDER_X2      (ALF_GRADIENT_BORDER * 2)
> > +#define ALF_STRIDE_MUL          (ALF_GRADIENT_BORDER + 1)
> > +#define ALF_GRAD_X_VSTEP        (ALF_GRADIENT_STEP * 8)
> > +#define ALF_GSTRIDE_MUL         (ALF_NUM_DIR / ALF_GRADIENT_STEP)
> > +
> > +// Shift right: equal to division by 2 (see ALF_GRADIENT_STEP)
> > +#define ALF_GSTRIDE_XG_BYTES    (2 * ALF_NUM_DIR / ALF_GRADIENT_STEP)
> > +
> > +#define ALF_GSTRIDE_SUB_BYTES   (2 * ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) * ALF_NUM_DIR)
> > +
> > +#define ALF_CLASS_INC           (ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP)
> > +#define ALF_CLASS_END           ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP)
> > +
> > +.macro ff_alf_classify_grad pix_size
> > +        // class_idx      .req x0
> > +        // transpose_idx  .req x1
> > +        // _src           .req x2
> > +        // _src_stride    .req x3
> > +        // width          .req w4
> > +        // height         .req w5
> > +        // vb_pos         .req w6
> > +        // gradient_tmp   .req x7
> > +
> > +        mov     w16, #ALF_STRIDE_MUL
> > +        add     w5, w5, #ALF_GRAD_BORDER_X2     // h = height + 4
> > +        mul     x16, x3, x16                    // 3 * stride
> > +        add     w4, w4, #ALF_GRAD_BORDER_X2     // w = width + 4
> > +        sub     x15, x2, x16                    // src -= (3 * stride)
> > +        mov     x17, x7
> > +    .if \pix_size == 1
> > +        sub     x15, x15, #ALF_GRADIENT_BORDER
> > +    .else
> > +        sub     x15, x15, #4
> > +    .endif
> > +        mov     w8, #0                          // y loop: y = 0
> > +1:
> > +        cmp     w8, w5
> > +        bge     10f
> > +
> > +        add     x16, x8, #1
> > +        mul     x14, x8, x3                     // y * stride
> > +        mul     x16, x16, x3
> > +        add     x10, x15, x14                   // s0 = src + y * stride
> > +        add     x14, x16, x3
> > +        add     x11, x15, x16                   // s1
> > +        add     x16, x14, x3
> > +        add     x12, x15, x14                   // s2
> > +        add     x13, x15, x16                   // s3
> > +
> > +        // if (y == vb_pos): s3 = s2
> > +        cmp     w8, w6
> > +        add     w16, w6, #ALF_GRADIENT_BORDER
> > +        csel    x13, x12, x13, eq
> > +        // if (y == vb_pos + 2): s0 = s1
> > +        cmp     w8, w16
> > +        csel    x10, x11, x10, eq
> > +
> > +        alf_classify_load_pixel_with_offset \pix_size, w16, x10, -1
> > +        alf_classify_load_pixel \pix_size, w14, x13
> > +        mov     v16.h[7], w16
> > +        mov     v28.h[7], w14
> > +
> > +        // load 4 pixels from *(s1-2) & *(s2-2)
> > +    .if \pix_size == 1
> > +        sub     x11, x11, #2
> > +        sub     x12, x12, #2
> > +        ld1     {v0.8b}, [x11]
> > +        ld1     {v1.8b}, [x12]
> > +        uxtl    v17.8h, v0.8b
> > +        uxtl    v18.8h, v1.8b
> > +    .else
> > +        sub     x11, x11, #4
> > +        sub     x12, x12, #4
> > +        ld1     {v17.8h}, [x11]
> > +        ld1     {v18.8h}, [x12]
> > +    .endif
> > +        ext     v22.16b, v22.16b, v17.16b, #8
> > +        ext     v24.16b, v24.16b, v18.16b, #8
> > +    .if \pix_size == 1
> > +        add     x11, x11, #4
> > +        add     x12, x12, #4
> > +    .else
> > +        add     x11, x11, #8
> > +        add     x12, x12, #8
> > +    .endif
> > +
> > +        // x loop
> > +        mov     w9, #0
> > +        b       11f
> > +2:
> > +        cmp     w9, w4
> > +        bge     20f
>
> Instead of having w9 count up to w4, it'd be more common to just decrement
> one register instead. Here you could do "mov w9, w4" instead of "mov w9,
> #0", and just decrement w9 with "subs w9, w9, #8" at the end, which allows
> you to do the branch as "b.gt 2b", avoiding some extra jumping around at
> the end of the loop. Same for the outer loop for w8/w5.
>
> > +
> > +        // Store operation starts from the second cycle
> > +        st2     {v4.8h, v5.8h}, [x17], #32
> > +11:
> > +    .if \pix_size == 1
> > +        // Load 8 pixels: s0 & s1+2
> > +        ld1     {v0.8b}, [x10], #8
> > +        ld1     {v1.8b}, [x11], #8
> > +        uxtl    v20.8h, v0.8b
> > +        uxtl    v26.8h, v1.8b
> > +        // Load 8 pixels: s2+2 & s3+1
> > +        ld1     {v0.8b}, [x12], #8
> > +        ld1     {v1.8b}, [x13], #8
> > +        uxtl    v27.8h, v0.8b
> > +        uxtl    v19.8h, v1.8b
> > +    .else
> > +        // Load 8 pixels: s0
> > +        ld1     {v20.8h}, [x10], #16
> > +        // Load 8 pixels: s1+2
> > +        ld1     {v26.8h}, [x11], #16
> > +        // Load 8 pixels: s2+2
> > +        ld1     {v27.8h}, [x12], #16
> > +        // Load 8 pixels: s3+1
> > +        ld1     {v19.8h}, [x13], #16
> > +    .endif
> > +
> > +        ext     v16.16b, v16.16b, v20.16b, #14
> > +        ext     v28.16b, v28.16b, v19.16b, #14
> > +
> > +        ext     v17.16b, v22.16b, v26.16b, #8
> > +        ext     v22.16b, v22.16b, v26.16b, #12
> > +
> > +        ext     v18.16b, v24.16b, v27.16b, #8
> > +        ext     v24.16b, v24.16b, v27.16b, #12
> > +
> > +        // Grad: Vertical & D0 (interleaved)
> > +        trn1    v21.8h, v20.8h, v16.8h          // first abs: operand 1
> > +        rev32   v23.8h, v22.8h                  // second abs: operand 1
> > +        trn2    v29.8h, v28.8h, v19.8h          // second abs: operand 2
> > +        trn1    v30.8h, v22.8h, v22.8h
> > +        trn2    v31.8h, v24.8h, v24.8h
> > +        add     v30.8h, v30.8h, v30.8h
> > +        add     v31.8h, v31.8h, v31.8h
> > +        sub     v0.8h, v30.8h, v21.8h
> > +        sub     v1.8h, v31.8h, v23.8h
> > +        sabd    v4.8h, v0.8h, v24.8h
> > +
> > +        // Grad: Horizontal & D1 (interleaved)
> > +        trn2    v21.8h, v17.8h, v20.8h          // first abs: operand 1
> > +        saba    v4.8h, v1.8h, v29.8h
> > +        trn2    v23.8h, v22.8h, v18.8h          // first abs: operand 2
> > +        trn1    v25.8h, v24.8h, v26.8h          // second abs: operand 1
> > +        trn1    v29.8h, v27.8h, v28.8h          // second abs: operand 2
> > +        sub     v0.8h, v30.8h, v21.8h
> > +        sub     v1.8h, v31.8h, v25.8h
> > +        sabd    v5.8h, v0.8h, v23.8h
> > +
> > +        // Prepare for the next interation:
> > +        mov     v16.16b, v20.16b
> > +        saba    v5.8h, v1.8h, v29.8h
> > +        mov     v28.16b, v19.16b
> > +        mov     v22.16b, v26.16b
> > +        mov     v24.16b, v27.16b
> > +
> > +        add     w9, w9, #8                      // x += 8
> > +        b       2b
> > +20:
> > +        // 8 pixels -> 4 cycles of generic
> > +        // 4 pixels -> paddings => half needs to be saved
> > +        st2     {v4.4h, v5.4h}, [x17], #16
> > +
> > +        add     w8, w8, #ALF_GRADIENT_STEP      // y += 2
> > +        b       1b
> > +10:
> > +        ret
> > +.endm
> > +
> > +.macro ff_alf_classify_sum
>
> This doesn't seem to need to be a macro,
> as it is only invoked once, without any parameters?
>
> > +        // sum0    .req x0
> > +        // sum1    .req x1
> > +        // grad    .req x2
> > +        // gshift  .req w3
> > +        // steps   .req w4
> > +        mov     w5, #2
> > +        mov     w6, #0
> > +        mul     w3, w3, w5
>
> If you want to multiply by 2, then just do "lsl w3, w3, #1".
>
> > +        movi    v16.4s, #0
> > +        movi    v21.4s, #0
> > +6:
> > +        prfm    pldl1keep, [x2]
>
> No prefetch instructions please. If you feel strongly about it, send a
> separate patch afterwards to add the prefetch instructions, with
> repeatable benchmark numbers.
>
> > +        cmp     w6, w4
> > +        bge     60f
> > +
> > +        ld1     {v17.4h}, [x2], #8
> > +        ld1     {v18.4h}, [x2], #8
> > +        uxtl    v17.4s, v17.4h
> > +        ld1     {v19.4h}, [x2], #8
> > +        uxtl    v18.4s, v18.4h
> > +        ld1     {v20.4h}, [x2], #8
> > +        uxtl    v19.4s, v19.4h
> > +        ld1     {v22.4h}, [x2], #8
> > +        uxtl    v20.4s, v20.4h
> > +        ld1     {v23.4h}, [x2]
>
> This sequence of 6 separate sequential loads of .4h registers could be
> expressed as two loads, with {.4h, .4h, .4h, .4h} and {.4h, .4h}, or three
> loads of {.4h, .4h} - that should probably be more efficient than doing 6
> separate loads.
>
> > +        uxtl    v22.4s, v22.4h
> > +        uxtl    v23.4s, v23.4h
> > +        add     v17.4s, v17.4s, v18.4s
> > +        add     v16.4s, v16.4s, v17.4s
>
> As this addition to v16 uses v17, which was updated in the directly
> preceding instruction, this causes stalls on in-order cores. It'd be
> better to move this add instruction down by one or two instructions.
>
> > +        add     v19.4s, v19.4s, v20.4s
> > +        add     v22.4s, v22.4s, v23.4s
> > +        add     v21.4s, v21.4s, v19.4s
> > +        add     v16.4s, v16.4s, v19.4s
> > +        add     v21.4s, v21.4s, v22.4s
> > +
> > +        sub     x2, x2, #8
> > +        add     w6, w6, #1                      // i += 1
>
> Instead of this register w6 counting up to w4, it'd be more idiomatic and
> requiring one register less, if you'd just decrement w4 instead, with a
> "subs w4, w4, #1" instruction. Then you can just do "b.gt 6b" at the end,
> rather than having to jump back to 6 to do the comparison there.
>
> > +        add     x2, x2, x3                      // grad += gstride - size * ALF_NUM_DIR
>
> Instead of having to subtract 8 from x2 at the end of each loop, just
> subtract 8 from x3 before the loop.
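
Before I rework this, let me check that I have read the comments on the sum
loop correctly. Here is a rough, untested sketch of how I would restructure
it: lsl instead of mul, the loads grouped as {.4h x4} + {.4h x2}, the
accumulation into v16 moved down a couple of instructions, and the step
counter counting down with subs/b.gt. With the grouped loads, the
per-iteration "sub x2, x2, #8" seems to disappear entirely, since the second
load simply doesn't post-increment:

        // sum0 -> x0, sum1 -> x1, grad -> x2, gshift -> w3, steps -> w4 (>= 1)
        lsl     w3, w3, #1                      // gshift in bytes
        movi    v16.4s, #0
        movi    v21.4s, #0
6:
        ld1     {v17.4h, v18.4h, v19.4h, v20.4h}, [x2], #32
        ld1     {v22.4h, v23.4h}, [x2]          // no post-increment (replaces the old sub)
        uxtl    v17.4s, v17.4h
        uxtl    v18.4s, v18.4h
        uxtl    v19.4s, v19.4h
        uxtl    v20.4s, v20.4h
        uxtl    v22.4s, v22.4h
        uxtl    v23.4s, v23.4h
        add     v17.4s, v17.4s, v18.4s
        add     v19.4s, v19.4s, v20.4s
        add     v22.4s, v22.4s, v23.4s
        add     v16.4s, v16.4s, v17.4s          // moved down to hide the v17 latency
        add     v21.4s, v21.4s, v19.4s
        add     v16.4s, v16.4s, v19.4s
        add     v21.4s, v21.4s, v22.4s
        add     x2, x2, x3                      // grad += gstride - size * ALF_NUM_DIR
        subs    w4, w4, #1
        b.gt    6b
        st1     {v16.4s}, [x0]
        st1     {v21.4s}, [x1]
        ret

The same subs/b.gt pattern should also work for the w9/w4 and w8/w5 loops in
ff_alf_classify_grad, as you suggest.
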
>
> > +        b       6b
> > +60:
> > +        st1     {v16.4s}, [x0]
> > +        st1     {v21.4s}, [x1]
> > +        ret
> > +.endm
> > +
> > +
> > +function ff_alf_classify_sum_neon, export=1
> > +        ff_alf_classify_sum
> > +endfunc
> > +
> > +function ff_alf_classify_grad_8_neon, export=1
> > +        ff_alf_classify_grad 1
> > +endfunc
> > +
> > +function ff_alf_classify_grad_10_neon, export=1
> > +        ff_alf_classify_grad 2
> > +endfunc
> > +
> > +function ff_alf_classify_grad_12_neon, export=1
> > +        ff_alf_classify_grad 2
> > +endfunc
> > diff --git a/libavcodec/aarch64/vvc/alf_template.c b/libavcodec/aarch64/vvc/alf_template.c
> > index 41f7bf8995..470222a634 100644
> > --- a/libavcodec/aarch64/vvc/alf_template.c
> > +++ b/libavcodec/aarch64/vvc/alf_template.c
> > @@ -155,3 +155,90 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst,
> >          }
> >      }
> > }
> > +
> > +#define ALF_DIR_VERT  0
> > +#define ALF_DIR_HORZ  1
> > +#define ALF_DIR_DIGA0 2
> > +#define ALF_DIR_DIGA1 3
> > +
> > +static void FUNC(ff_alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac)
> > +{
>
> Static functions don't need an ff_ prefix
>
> > +    static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };
> > +
> > +    int hv0, hv1, dir_hv, d0, d1, dir_d, hvd1, hvd0, sum_hv, dir1;
> > +
> > +    dir_hv = sum[ALF_DIR_VERT] <= sum[ALF_DIR_HORZ];
> > +    hv1 = FFMAX(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
> > +    hv0 = FFMIN(sum[ALF_DIR_VERT], sum[ALF_DIR_HORZ]);
> > +
> > +    dir_d = sum[ALF_DIR_DIGA0] <= sum[ALF_DIR_DIGA1];
> > +    d1 = FFMAX(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
> > +    d0 = FFMIN(sum[ALF_DIR_DIGA0], sum[ALF_DIR_DIGA1]);
> > +
> > +    //promote to avoid overflow
> > +    dir1 = (uint64_t)d1 * hv0 <= (uint64_t)hv1 * d0;
> > +    hvd1 = dir1 ? hv1 : d1;
> > +    hvd0 = dir1 ? hv0 : d0;
> > +
> > +    sum_hv = sum[ALF_DIR_HORZ] + sum[ALF_DIR_VERT];
> > +    *class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
> > +    if (hvd1 * 2 > 9 * hvd0)
> > +        *class_idx += ((dir1 << 1) + 2) * 5;
> > +    else if (hvd1 > 2 * hvd0)
> > +        *class_idx += ((dir1 << 1) + 1) * 5;
> > +
> > +    *transpose_idx = dir_d * 2 + dir_hv;
> > +}
> > +
> > +static void FUNC(ff_alf_classify)(int *class_idx, int *transpose_idx,
>
> Ditto
>
> > +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> > +    const int vb_pos, int16_t *gradient_tmp)
> > +{
> > +    int16_t *grad;
> > +
> > +    const int w = width + ALF_GRADIENT_BORDER * 2;
> > +    const int size = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
> > +    const int gstride = (w / ALF_GRADIENT_STEP) * ALF_NUM_DIR;
> > +    const int gshift = gstride - size * ALF_NUM_DIR;
> > +
> > +    for (int y = 0; y < height ; y += ALF_BLOCK_SIZE ) {
> > +        int start = 0;
> > +        int end = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
> > +        int ac = 2;
> > +        if (y + ALF_BLOCK_SIZE == vb_pos) {
> > +            end -= ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
> > +            ac = 3;
> > +        } else if (y == vb_pos) {
> > +            start += ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
> > +            ac = 3;
> > +        }
> > +        for (int x = 0; x < width; x += (2*ALF_BLOCK_SIZE)) {
> > +            const int xg = x / ALF_GRADIENT_STEP;
> > +            const int yg = y / ALF_GRADIENT_STEP;
> > +            int sum0[ALF_NUM_DIR];
> > +            int sum1[ALF_NUM_DIR];
> > +            grad = gradient_tmp + (yg + start) * gstride + xg * ALF_NUM_DIR;
> > +             ff_alf_classify_sum_neon(sum0, sum1, grad, gshift, end-start);
>
> This line seems to be indented differently than the rest
>
> > +            FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum0, ac);
> > +            class_idx++;
> > +            transpose_idx++;
> > +            FUNC(ff_alf_get_idx)(class_idx, transpose_idx, sum1, ac);
> > +            class_idx++;
> > +            transpose_idx++;
> > +        }
> > +    }
> > +
> > +}
> > +
> > +void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
> > +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> > +    const int vb_pos, int16_t *gradient_tmp);
> > +
> > +
> > +static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
> > +    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
> > +    const int vb_pos, int *gradient_tmp) {
>
> Move the opening brace of a function to the next line
>
> > +        FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride, width, height, vb_pos, (int16_t*)gradient_tmp);
>
> This is indented one level too much. With the opening brace on its own
> line, that would be clearer to see.
>
>
> If you want to, you can sign up at https://code.ffmpeg.org and submit your
> patch as a PR to https://code.ffmpeg.org/FFmpeg/FFmpeg, for easier
> followup/reviewing of your submission.
>
> // Martin