Re: [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter

Zhao Zhili Tue, 29 Apr 2025 00:52:34 -0700


> On Apr 25, 2025, at 16:25, Martin Storsjö <[email protected]> wrote:
> 
> On Tue, 15 Apr 2025, Zhao Zhili wrote:
> 
>> From: Zhao Zhili <[email protected]>
>> 
>> int8_t[] is enough for offset_table of 8 bit streams.
>> 
>> On rpi5:
>>                            Before               After
>> hevc_sao_band_8_8_c:          252.3 ( 1.00x)     252.3 ( 1.00x)
>> hevc_sao_band_8_8_neon:        95.8 ( 2.63x)      61.0 ( 4.14x)
>> hevc_sao_band_16_8_c:         875.2 ( 1.00x)     864.9 ( 1.00x)
>> hevc_sao_band_16_8_neon:      317.5 ( 2.76x)     150.0 ( 5.76x)
>> hevc_sao_band_32_8_c:        3853.5 ( 1.00x)    3871.6 ( 1.00x)
>> hevc_sao_band_32_8_neon:     1222.3 ( 3.15x)     550.6 ( 7.03x)
>> hevc_sao_band_48_8_c:        8203.6 ( 1.00x)    8182.6 ( 1.00x)
>> hevc_sao_band_48_8_neon:     2685.7 ( 3.05x)    1185.8 ( 6.90x)
>> hevc_sao_band_64_8_c:       14023.0 ( 1.00x)   14038.9 ( 1.00x)
>> hevc_sao_band_64_8_neon:     4783.2 ( 2.93x)    2078.4 ( 6.75x)
>> ---
>> libavcodec/aarch64/h26x/dsp.h             |  4 +
>> libavcodec/aarch64/h26x/sao_neon.S        | 93 ++++++++++++++---------
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |  4 +-
>> libavcodec/aarch64/vvc/dsp_init.c         |  5 +-
>> 4 files changed, 65 insertions(+), 41 deletions(-)
>> 
>> diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
>> index 0fefb4d70f..6ea6a8d36a 100644
>> --- a/libavcodec/aarch64/h26x/dsp.h
>> +++ b/libavcodec/aarch64/h26x/dsp.h
>> @@ -28,6 +28,10 @@ void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, 
>> const uint8_t *_src,
>>                                        ptrdiff_t stride_dst, ptrdiff_t 
>> stride_src,
>>                                        const int16_t *sao_offset_val, int 
>> sao_left_class,
>>                                        int width, int height);
>> +void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t 
>> *_src,
>> +                                        ptrdiff_t stride_dst, ptrdiff_t 
>> stride_src,
>> +                                        const int16_t *sao_offset_val, int 
>> sao_left_class,
>> +                                        int width, int height);
>> void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, 
>> ptrdiff_t stride_dst,
>>                                          const int16_t *sao_offset_val, int 
>> eo, int width, int height);
>> void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, 
>> ptrdiff_t stride_dst,
>> diff --git a/libavcodec/aarch64/h26x/sao_neon.S 
>> b/libavcodec/aarch64/h26x/sao_neon.S
>> index c43820135e..60c026fe95 100644
>> --- a/libavcodec/aarch64/h26x/sao_neon.S
>> +++ b/libavcodec/aarch64/h26x/sao_neon.S
>> @@ -35,48 +35,67 @@
>> //                      int16_t *sao_offset_val, int sao_left_class,
>> //                      int width, int height)
>> function ff_h26x_sao_band_filter_8x8_8_neon, export=1
>> -        stp             xzr, xzr, [sp, #-64]!
>> +        stp             xzr, xzr, [sp, #-32]!
>>        stp             xzr, xzr, [sp, #16]
>> -        stp             xzr, xzr, [sp, #32]
>> -        stp             xzr, xzr, [sp, #48]
>>        mov             w8,  #4
>> -0:      ldrsh           x9, [x4,  x8, lsl #1]      // sao_offset_val[k+1]
>> -        subs            w8,  w8,  #1
>> -        add             w10, w8,  w5               // k + sao_left_class
>> +0:
>> +        ldrsh           x9, [x4, x8, lsl #1]        // sao_offset_val[k+1]
>> +        subs            w8, w8, #1
>> +        add             w10, w8, w5                 // k + sao_left_class
>>        and             w10, w10, #0x1F
>> -        strh            w9, [sp, x10, lsl #1]
>> +        strb            w9, [sp, x10]
>>        bne             0b
>> -        add             w6,  w6,  #7
>> -        bic             w6,  w6,  #7
>> -        ld1             {v16.16b-v19.16b}, [sp], #64
>> -        sub             x2,  x2,  x6
>> -        sub             x3,  x3,  x6
>> -        movi            v20.8h,   #1
>> -1:      mov             w8,  w6                    // beginning of line
>> -2:      // Simple layout for accessing 16bit values
>> -        // with 8bit LUT.
>> -        //
>> -        //   00  01  02  03  04  05  06  07
>> -        // +----------------------------------->
>> -        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
>> -        // +----------------------------------->
>> -        //    i-0     i-1     i-2     i-3
>> -        ld1             {v2.8b}, [x1], #8          // dst[x] = 
>> av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
>> -        subs            w8, w8,  #8
>> -        uxtl            v0.8h,  v2.8b              // load src[x]
>> -        ushr            v2.8h,  v0.8h, #3          // >> BIT_DEPTH - 3
>> -        shl             v1.8h,  v2.8h, #1          // low (x2, accessing 
>> short)
>> -        add             v3.8h,  v1.8h, v20.8h      // +1 access upper short
>> -        sli             v1.8h,  v3.8h, #8          // shift insert index to 
>> upper byte
>> -        tbx             v2.16b, {v16.16b-v19.16b}, v1.16b // table
>> -        add             v1.8h,  v0.8h, v2.8h       // src[x] + table
>> -        sqxtun          v4.8b,  v1.8h              // clip + narrow
>> -        st1             {v4.8b}, [x0], #8          // store
>> -        // done 8 pixels
>> +        ldp             q16, q17, [sp], #32
>> +1:
>> +        ld1             {v2.8b}, [x1], x3
>> +        subs            w7, w7, #1
>> +        uxtl            v0.8h, v2.8b
>> +        ushr            v3.8b, v2.8b, #3          // >> BIT_DEPTH - 3
> 
> Nitpick: The comment on this line seems to be misaligned with the other 
> comments below - please check.


Fixed before push.

> 
>> +        tbx             v3.8b, {v16.16b-v17.16b}, v3.8b
> 
> Is there any specific reason for preferring tbx over tbl here? (I know the 
> existing code used tbx.) Without having studied cycle tables, I would expect 
> tbl to maybe be slightly simpler, but perhaps there's no difference (or tbx 
> is faster)?

tbl can be faster. The result is quite impressive. Changed to tbl before push.

                             Before               tbx                tbl
hevc_sao_band_8_8_c:          252.3 ( 1.00x)     252.3 ( 1.00x)     252.3 ( 1.00x)
hevc_sao_band_8_8_neon:        95.8 ( 2.63x)      61.0 ( 4.14x)      61.0 ( 4.57x)
hevc_sao_band_16_8_c:         875.2 ( 1.00x)     864.9 ( 1.00x)     864.9 ( 1.00x)
hevc_sao_band_16_8_neon:      317.5 ( 2.76x)     150.0 ( 5.76x)     150.0 ( 6.26x)
hevc_sao_band_32_8_c:        3853.5 ( 1.00x)    3871.6 ( 1.00x)    3871.6 ( 1.00x)
hevc_sao_band_32_8_neon:     1222.3 ( 3.15x)     550.6 ( 7.03x)     550.6 ( 7.39x)
hevc_sao_band_48_8_c:        8203.6 ( 1.00x)    8182.6 ( 1.00x)    8182.6 ( 1.00x)
hevc_sao_band_48_8_neon:     2685.7 ( 3.05x)    1185.8 ( 6.90x)    1185.8 ( 7.36x)
hevc_sao_band_64_8_c:       14023.0 ( 1.00x)   14038.9 ( 1.00x)   14038.9 ( 1.00x)
hevc_sao_band_64_8_neon:     4783.2 ( 2.93x)    2078.4 ( 6.75x)    2078.4 ( 7.15x)

> 
> 
> Other than these comments, this patch looks good to me, thanks - feel free to 
> push.
> 
> // Martin
> 
> _______________________________________________
> ffmpeg-devel mailing list
> [email protected] <mailto:[email protected]>
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> 
> To unsubscribe, visit link above, or email
> [email protected] <mailto:[email protected]> with 
> subject "unsubscribe".

_______________________________________________
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] aarch64/h26x: optimize sao_band_filter

Reply via email to