Re: [FFmpeg-devel] [PATCH v4] avcodec/aarch64/hevc: add luma deblock NEON

2024-02-28 Thread J. Dekker

Martin Storsjö  writes:

> On Wed, 28 Feb 2024, J. Dekker wrote:
>
>>
>> Martin Storsjö  writes:
>>
>>> On Wed, 28 Feb 2024, J. Dekker wrote:
>>>

 Martin Storsjö  writes:

> On Tue, 27 Feb 2024, J. Dekker wrote:
>
>> Benched using single-threaded full decode on an Ampere Altra.
>>
>> Bpp Before  After  Speedup
>> 8   73,3s   65,2s  1.124x
>> 10  114,2s  104,0s 1.098x
>> 12  125,8s  115,7s 1.087x
>>
>> Signed-off-by: J. Dekker 
>> ---
>>
>> Slightly improved 12bit version.
>>
>> libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
>> 2 files changed, 435 insertions(+)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S
>> b/libavcodec/aarch64/hevcdsp_deblock_neon.S
>> index 8227f65649..581056a91e 100644
>> --- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
>> @@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12
>> hevc_v_loop_filter_chroma 8
>> hevc_v_loop_filter_chroma 10
>> hevc_v_loop_filter_chroma 12
>> +
>> +.macro hevc_loop_filter_luma_body bitdepth
>> +function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
>> +.if \bitdepth > 8
>> + lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
>> +.else
>> +uxtlv0.8h, v0.8b
>> +uxtlv1.8h, v1.8b
>> +uxtlv2.8h, v2.8b
>> +uxtlv3.8h, v3.8b
>> +uxtlv4.8h, v4.8b
>> +uxtlv5.8h, v5.8b
>> +uxtlv6.8h, v6.8b
>> +uxtlv7.8h, v7.8b
>> +.endif
>> +ldr w7, [x3] // tc[0]
>> +ldr w8, [x3, #4] // tc[1]
>> +dup v18.4h, w7
>> +dup v19.4h, w8
>> +trn1v18.2d, v18.2d, v19.2d
>> +.if \bitdepth > 8
>> +shl v18.8h, v18.8h, #(\bitdepth - 8)
>> +.endif
>> +dup v27.8h, w2 // beta
>> +// tc25
>> +shl v19.8h, v18.8h, #2 // * 4
>> +add v19.8h, v19.8h, v18.8h // (tc * 5)
>> +srshr   v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
>> +sshrv17.8h, v27.8h, #2 // beta2
>> +
>> +// beta_2 check
>> +// dp0  = abs(P2  - 2 * P1  + P0)
>> +add v22.8h, v3.8h, v1.8h
>> +shl v23.8h, v2.8h, #1
>> +sabdv30.8h, v22.8h, v23.8h
>> +// dq0  = abs(Q2  - 2 * Q1  + Q0)
>> +add v21.8h, v6.8h, v4.8h
>> +shl v26.8h, v5.8h, #1
>> +sabdv31.8h, v21.8h, v26.8h
>> +// d0   = dp0 + dq0
>> +add v20.8h, v30.8h, v31.8h
>> +shl v25.8h, v20.8h, #1
>> +// (d0 << 1) < beta_2
>> +cmgtv23.8h, v17.8h, v25.8h
>> +
>> +// beta check
>> +// d0 + d3 < beta
>> +mov x9, #0xFFFF00000000FFFF
>> +dup v24.2d, x9
>> +and v25.16b, v24.16b, v20.16b
>> +addpv25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
>> +addpv25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
>> + cmgt v25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
>> +mov w9, v25.s[0]
>
> I don't quite understand what this sequence does and/or how our data is
> laid
> out in our registers - we have d0 on input in v20, where's d3? And doesn't
> the
> "and" throw away half of the input elements here?
>
> I see some similar patterns with the masking and handling below as well -
> I get
> a feeling that I don't quite understand the algorithm here, and/or the
> data
> layout.

 We have d0, d1, d2, d3 for both 4 line blocks in v20, mask out d1/d2 and
 use pair-wise adds to move our data around and calculate d0+d3
 together. The first addp just moves elements around, the second addp
 adds d0 + 0 + 0 + d3.
>>>
>>> Right, I guess this is the bit that was surprising. I would have expected to
>>> have e.g. all the d0 values for e.g. the 8 individual pixels in one SIMD
>>> register, and all the d3 values for all pixels in another SIMD register.
>>>
>>> So as we're operating on 8 pixels in parallel, each of those 8 pixels have
>>> their own d0/d3 values, right? Or is this a case where we have just one
>>> d0/d3
>>> value for a range of pixels?
>>
>> Yes, d0/d1/d2/d3 are per 4 lines of 8 pixels, it's because d0 and d3 are
>> calculated within their own line, d0 from line 0, d3 from line 3. Maybe
>> it's more confusing since we are doing both halves of the filter at the same time?

Re: [FFmpeg-devel] [PATCH v4] avcodec/aarch64/hevc: add luma deblock NEON

2024-02-28 Thread Martin Storsjö

On Wed, 28 Feb 2024, J. Dekker wrote:



Martin Storsjö  writes:


On Wed, 28 Feb 2024, J. Dekker wrote:



Martin Storsjö  writes:


On Tue, 27 Feb 2024, J. Dekker wrote:


Benched using single-threaded full decode on an Ampere Altra.

Bpp Before  After  Speedup
8   73,3s   65,2s  1.124x
10  114,2s  104,0s 1.098x
12  125,8s  115,7s 1.087x

Signed-off-by: J. Dekker 
---

Slightly improved 12bit version.

libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++
libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
2 files changed, 435 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S 
b/libavcodec/aarch64/hevcdsp_deblock_neon.S
index 8227f65649..581056a91e 100644
--- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12
hevc_v_loop_filter_chroma 8
hevc_v_loop_filter_chroma 10
hevc_v_loop_filter_chroma 12
+
+.macro hevc_loop_filter_luma_body bitdepth
+function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
+.if \bitdepth > 8
+lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
+.else
+uxtlv0.8h, v0.8b
+uxtlv1.8h, v1.8b
+uxtlv2.8h, v2.8b
+uxtlv3.8h, v3.8b
+uxtlv4.8h, v4.8b
+uxtlv5.8h, v5.8b
+uxtlv6.8h, v6.8b
+uxtlv7.8h, v7.8b
+.endif
+ldr w7, [x3] // tc[0]
+ldr w8, [x3, #4] // tc[1]
+dup v18.4h, w7
+dup v19.4h, w8
+trn1v18.2d, v18.2d, v19.2d
+.if \bitdepth > 8
+shl v18.8h, v18.8h, #(\bitdepth - 8)
+.endif
+dup v27.8h, w2 // beta
+// tc25
+shl v19.8h, v18.8h, #2 // * 4
+add v19.8h, v19.8h, v18.8h // (tc * 5)
+srshr   v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
+sshrv17.8h, v27.8h, #2 // beta2
+
+// beta_2 check
+// dp0  = abs(P2  - 2 * P1  + P0)
+add v22.8h, v3.8h, v1.8h
+shl v23.8h, v2.8h, #1
+sabdv30.8h, v22.8h, v23.8h
+// dq0  = abs(Q2  - 2 * Q1  + Q0)
+add v21.8h, v6.8h, v4.8h
+shl v26.8h, v5.8h, #1
+sabdv31.8h, v21.8h, v26.8h
+// d0   = dp0 + dq0
+add v20.8h, v30.8h, v31.8h
+shl v25.8h, v20.8h, #1
+// (d0 << 1) < beta_2
+cmgtv23.8h, v17.8h, v25.8h
+
+// beta check
+// d0 + d3 < beta
+mov x9, #0xFFFF00000000FFFF
+dup v24.2d, x9
+and v25.16b, v24.16b, v20.16b
+addpv25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
+addpv25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
+cmgtv25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
+mov w9, v25.s[0]


I don't quite understand what this sequence does and/or how our data is laid
out in our registers - we have d0 on input in v20, where's d3? And doesn't the
"and" throw away half of the input elements here?

I see some similar patterns with the masking and handling below as well - I get
a feeling that I don't quite understand the algorithm here, and/or the data
layout.


We have d0, d1, d2, d3 for both 4 line blocks in v20, mask out d1/d2 and
use pair-wise adds to move our data around and calculate d0+d3
together. The first addp just moves elements around, the second addp
adds d0 + 0 + 0 + d3.


Right, I guess this is the bit that was surprising. I would have expected to
have e.g. all the d0 values for e.g. the 8 individual pixels in one SIMD
register, and all the d3 values for all pixels in another SIMD register.

So as we're operating on 8 pixels in parallel, each of those 8 pixels have
their own d0/d3 values, right? Or is this a case where we have just one d0/d3
value for a range of pixels?


Yes, d0/d1/d2/d3 are per 4 lines of 8 pixels, it's because d0 and d3 are
calculated within their own line, d0 from line 0, d3 from line 3. Maybe
it's more confusing since we are doing both halves of the filter at the
same time? v20 contains d0 d1 d2 d3 d0 d1 d2 d3, where the second d0 is
distinct from the first.

But essentially we're doing the same operation across the entire 8
lines, the filter just makes an overall skip decision for each block of
4 lines based on the sum of the result from line 0 and 3.


Ah, right, I see. I guess this makes sense then. Thanks!

Thus, no further objections to it; the optimizing of loading/storing can 
be done separately.


// Martin
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v4] avcodec/aarch64/hevc: add luma deblock NEON

2024-02-28 Thread J. Dekker

Martin Storsjö  writes:

> On Wed, 28 Feb 2024, J. Dekker wrote:
>
>>
>> Martin Storsjö  writes:
>>
>>> On Tue, 27 Feb 2024, J. Dekker wrote:
>>>
 Benched using single-threaded full decode on an Ampere Altra.

 Bpp Before  After  Speedup
 8   73,3s   65,2s  1.124x
 10  114,2s  104,0s 1.098x
 12  125,8s  115,7s 1.087x

 Signed-off-by: J. Dekker 
 ---

 Slightly improved 12bit version.

 libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
 2 files changed, 435 insertions(+)

 diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S 
 b/libavcodec/aarch64/hevcdsp_deblock_neon.S
 index 8227f65649..581056a91e 100644
 --- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
 +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
 @@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12
 hevc_v_loop_filter_chroma 8
 hevc_v_loop_filter_chroma 10
 hevc_v_loop_filter_chroma 12
 +
 +.macro hevc_loop_filter_luma_body bitdepth
 +function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
 +.if \bitdepth > 8
 +lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
 +.else
 +uxtlv0.8h, v0.8b
 +uxtlv1.8h, v1.8b
 +uxtlv2.8h, v2.8b
 +uxtlv3.8h, v3.8b
 +uxtlv4.8h, v4.8b
 +uxtlv5.8h, v5.8b
 +uxtlv6.8h, v6.8b
 +uxtlv7.8h, v7.8b
 +.endif
 +ldr w7, [x3] // tc[0]
 +ldr w8, [x3, #4] // tc[1]
 +dup v18.4h, w7
 +dup v19.4h, w8
 +trn1v18.2d, v18.2d, v19.2d
 +.if \bitdepth > 8
 +shl v18.8h, v18.8h, #(\bitdepth - 8)
 +.endif
 +dup v27.8h, w2 // beta
 +// tc25
 +shl v19.8h, v18.8h, #2 // * 4
 +add v19.8h, v19.8h, v18.8h // (tc * 5)
 +srshr   v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
 +sshrv17.8h, v27.8h, #2 // beta2
 +
 +// beta_2 check
 +// dp0  = abs(P2  - 2 * P1  + P0)
 +add v22.8h, v3.8h, v1.8h
 +shl v23.8h, v2.8h, #1
 +sabdv30.8h, v22.8h, v23.8h
 +// dq0  = abs(Q2  - 2 * Q1  + Q0)
 +add v21.8h, v6.8h, v4.8h
 +shl v26.8h, v5.8h, #1
 +sabdv31.8h, v21.8h, v26.8h
 +// d0   = dp0 + dq0
 +add v20.8h, v30.8h, v31.8h
 +shl v25.8h, v20.8h, #1
 +// (d0 << 1) < beta_2
 +cmgtv23.8h, v17.8h, v25.8h
 +
 +// beta check
 +// d0 + d3 < beta
 +mov x9, #0xFFFF00000000FFFF
 +dup v24.2d, x9
 +and v25.16b, v24.16b, v20.16b
 +addpv25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
 +addpv25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
 +cmgtv25.4h, v27.4h, v25.4h // lower/upper mask in 
 h[0/1]
 +mov w9, v25.s[0]
>>>
>>> I don't quite understand what this sequence does and/or how our data is laid
>>> out in our registers - we have d0 on input in v20, where's d3? And doesn't 
>>> the
>>> "and" throw away half of the input elements here?
>>>
>>> I see some similar patterns with the masking and handling below as well - I 
>>> get
>>> a feeling that I don't quite understand the algorithm here, and/or the data
>>> layout.
>>
>> We have d0, d1, d2, d3 for both 4 line blocks in v20, mask out d1/d2 and
>> use pair-wise adds to move our data around and calculate d0+d3
>> together. The first addp just moves elements around, the second addp
>> adds d0 + 0 + 0 + d3.
>
> Right, I guess this is the bit that was surprising. I would have expected to
> have e.g. all the d0 values for e.g. the 8 individual pixels in one SIMD
> register, and all the d3 values for all pixels in another SIMD register.
>
> So as we're operating on 8 pixels in parallel, each of those 8 pixels have
> their own d0/d3 values, right? Or is this a case where we have just one d0/d3
> value for a range of pixels?

Yes, d0/d1/d2/d3 are per 4 lines of 8 pixels, it's because d0 and d3 are
calculated within their own line, d0 from line 0, d3 from line 3. Maybe
it's more confusing since we are doing both halves of the filter at the
same time? v20 contains d0 d1 d2 d3 d0 d1 d2 d3, where the second d0 is
distinct from the first.

But essentially we're doing the same operation across the entire 8
lines, the filter just makes an overall skip decision for each block of
4 lines based on the sum of the result from line 0 and 3.

Re: [FFmpeg-devel] [PATCH v4] avcodec/aarch64/hevc: add luma deblock NEON

2024-02-28 Thread Martin Storsjö

On Wed, 28 Feb 2024, J. Dekker wrote:



Martin Storsjö  writes:


On Tue, 27 Feb 2024, J. Dekker wrote:


Benched using single-threaded full decode on an Ampere Altra.

Bpp Before  After  Speedup
8   73,3s   65,2s  1.124x
10  114,2s  104,0s 1.098x
12  125,8s  115,7s 1.087x

Signed-off-by: J. Dekker 
---

Slightly improved 12bit version.

libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++
libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
2 files changed, 435 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S 
b/libavcodec/aarch64/hevcdsp_deblock_neon.S
index 8227f65649..581056a91e 100644
--- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12
hevc_v_loop_filter_chroma 8
hevc_v_loop_filter_chroma 10
hevc_v_loop_filter_chroma 12
+
+.macro hevc_loop_filter_luma_body bitdepth
+function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
+.if \bitdepth > 8
+lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
+.else
+uxtlv0.8h, v0.8b
+uxtlv1.8h, v1.8b
+uxtlv2.8h, v2.8b
+uxtlv3.8h, v3.8b
+uxtlv4.8h, v4.8b
+uxtlv5.8h, v5.8b
+uxtlv6.8h, v6.8b
+uxtlv7.8h, v7.8b
+.endif
+ldr w7, [x3] // tc[0]
+ldr w8, [x3, #4] // tc[1]
+dup v18.4h, w7
+dup v19.4h, w8
+trn1v18.2d, v18.2d, v19.2d
+.if \bitdepth > 8
+shl v18.8h, v18.8h, #(\bitdepth - 8)
+.endif
+dup v27.8h, w2 // beta
+// tc25
+shl v19.8h, v18.8h, #2 // * 4
+add v19.8h, v19.8h, v18.8h // (tc * 5)
+srshr   v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
+sshrv17.8h, v27.8h, #2 // beta2
+
+// beta_2 check
+// dp0  = abs(P2  - 2 * P1  + P0)
+add v22.8h, v3.8h, v1.8h
+shl v23.8h, v2.8h, #1
+sabdv30.8h, v22.8h, v23.8h
+// dq0  = abs(Q2  - 2 * Q1  + Q0)
+add v21.8h, v6.8h, v4.8h
+shl v26.8h, v5.8h, #1
+sabdv31.8h, v21.8h, v26.8h
+// d0   = dp0 + dq0
+add v20.8h, v30.8h, v31.8h
+shl v25.8h, v20.8h, #1
+// (d0 << 1) < beta_2
+cmgtv23.8h, v17.8h, v25.8h
+
+// beta check
+// d0 + d3 < beta
+mov x9, #0xFFFF00000000FFFF
+dup v24.2d, x9
+and v25.16b, v24.16b, v20.16b
+addpv25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
+addpv25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
+cmgtv25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
+mov w9, v25.s[0]


I don't quite understand what this sequence does and/or how our data is laid
out in our registers - we have d0 on input in v20, where's d3? And doesn't the
"and" throw away half of the input elements here?

I see some similar patterns with the masking and handling below as well - I get
a feeling that I don't quite understand the algorithm here, and/or the data
layout.


We have d0, d1, d2, d3 for both 4 line blocks in v20, mask out d1/d2 and
use pair-wise adds to move our data around and calculate d0+d3
together. The first addp just moves elements around, the second addp
adds d0 + 0 + 0 + d3.


Right, I guess this is the bit that was surprising. I would have expected 
to have e.g. all the d0 values for e.g. the 8 individual pixels in one 
SIMD register, and all the d3 values for all pixels in another SIMD 
register.


So as we're operating on 8 pixels in parallel, each of those 8 pixels have 
their own d0/d3 values, right? Or is this a case where we have just one 
d0/d3 value for a range of pixels?


// Martin
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v4] avcodec/aarch64/hevc: add luma deblock NEON

2024-02-28 Thread J. Dekker

Martin Storsjö  writes:

> On Tue, 27 Feb 2024, J. Dekker wrote:
>
>> Benched using single-threaded full decode on an Ampere Altra.
>>
>> Bpp Before  After  Speedup
>> 8   73,3s   65,2s  1.124x
>> 10  114,2s  104,0s 1.098x
>> 12  125,8s  115,7s 1.087x
>>
>> Signed-off-by: J. Dekker 
>> ---
>>
>> Slightly improved 12bit version.
>>
>> libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
>> 2 files changed, 435 insertions(+)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S 
>> b/libavcodec/aarch64/hevcdsp_deblock_neon.S
>> index 8227f65649..581056a91e 100644
>> --- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
>> @@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12
>> hevc_v_loop_filter_chroma 8
>> hevc_v_loop_filter_chroma 10
>> hevc_v_loop_filter_chroma 12
>> +
>> +.macro hevc_loop_filter_luma_body bitdepth
>> +function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
>> +.if \bitdepth > 8
>> +lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
>> +.else
>> +uxtlv0.8h, v0.8b
>> +uxtlv1.8h, v1.8b
>> +uxtlv2.8h, v2.8b
>> +uxtlv3.8h, v3.8b
>> +uxtlv4.8h, v4.8b
>> +uxtlv5.8h, v5.8b
>> +uxtlv6.8h, v6.8b
>> +uxtlv7.8h, v7.8b
>> +.endif
>> +ldr w7, [x3] // tc[0]
>> +ldr w8, [x3, #4] // tc[1]
>> +dup v18.4h, w7
>> +dup v19.4h, w8
>> +trn1v18.2d, v18.2d, v19.2d
>> +.if \bitdepth > 8
>> +shl v18.8h, v18.8h, #(\bitdepth - 8)
>> +.endif
>> +dup v27.8h, w2 // beta
>> +// tc25
>> +shl v19.8h, v18.8h, #2 // * 4
>> +add v19.8h, v19.8h, v18.8h // (tc * 5)
>> +srshr   v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
>> +sshrv17.8h, v27.8h, #2 // beta2
>> +
>> +// beta_2 check
>> +// dp0  = abs(P2  - 2 * P1  + P0)
>> +add v22.8h, v3.8h, v1.8h
>> +shl v23.8h, v2.8h, #1
>> +sabdv30.8h, v22.8h, v23.8h
>> +// dq0  = abs(Q2  - 2 * Q1  + Q0)
>> +add v21.8h, v6.8h, v4.8h
>> +shl v26.8h, v5.8h, #1
>> +sabdv31.8h, v21.8h, v26.8h
>> +// d0   = dp0 + dq0
>> +add v20.8h, v30.8h, v31.8h
>> +shl v25.8h, v20.8h, #1
>> +// (d0 << 1) < beta_2
>> +cmgtv23.8h, v17.8h, v25.8h
>> +
>> +// beta check
>> +// d0 + d3 < beta
>> +mov x9, #0xFFFF00000000FFFF
>> +dup v24.2d, x9
>> +and v25.16b, v24.16b, v20.16b
>> +addpv25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
>> +addpv25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
>> +cmgtv25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
>> +mov w9, v25.s[0]
>
> I don't quite understand what this sequence does and/or how our data is laid
> out in our registers - we have d0 on input in v20, where's d3? And doesn't the
> "and" throw away half of the input elements here?
>
> I see some similar patterns with the masking and handling below as well - I 
> get
> a feeling that I don't quite understand the algorithm here, and/or the data
> layout.

We have d0, d1, d2, d3 for both 4 line blocks in v20, mask out d1/d2 and
use pair-wise adds to move our data around and calculate d0+d3
together. The first addp just moves elements around, the second addp
adds d0 + 0 + 0 + d3.

Then we can check d0+d3 < beta and use the fact that the compare returns
either 0 or -1 and sign-extend to half the register width for a
mask. This allows us to calculate both 4 line block masks at the same
time in NEON registers.

>> +.if \bitdepth > 8
>> +ld1 {v0.8h}, [x0], x1
>> +ld1 {v1.8h}, [x0], x1
>> +ld1 {v2.8h}, [x0], x1
>> +ld1 {v3.8h}, [x0], x1
>> +ld1 {v4.8h}, [x0], x1
>> +ld1 {v5.8h}, [x0], x1
>> +ld1 {v6.8h}, [x0], x1
>> +ld1 {v7.8h}, [x0]
>> +mov w14, #((1 << \bitdepth) - 1)
>
> For loads like these, we can generally save a bit by using two alternating
> registers for loading, with a double stride - see e.g. the vp9 loop filter
> implementations. But that's a micro optimization.
>
> Other than that, this mostly looks reasonable.

Will fix on push if no other comments.

-- 
jd
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email

Re: [FFmpeg-devel] [PATCH v4] avcodec/aarch64/hevc: add luma deblock NEON

2024-02-27 Thread Martin Storsjö

On Tue, 27 Feb 2024, J. Dekker wrote:


Benched using single-threaded full decode on an Ampere Altra.

Bpp Before  After  Speedup
8   73,3s   65,2s  1.124x
10  114,2s  104,0s 1.098x
12  125,8s  115,7s 1.087x

Signed-off-by: J. Dekker 
---

Slightly improved 12bit version.

libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++
libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
2 files changed, 435 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S 
b/libavcodec/aarch64/hevcdsp_deblock_neon.S
index 8227f65649..581056a91e 100644
--- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12
hevc_v_loop_filter_chroma 8
hevc_v_loop_filter_chroma 10
hevc_v_loop_filter_chroma 12
+
+.macro hevc_loop_filter_luma_body bitdepth
+function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
+.if \bitdepth > 8
+lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
+.else
+uxtlv0.8h, v0.8b
+uxtlv1.8h, v1.8b
+uxtlv2.8h, v2.8b
+uxtlv3.8h, v3.8b
+uxtlv4.8h, v4.8b
+uxtlv5.8h, v5.8b
+uxtlv6.8h, v6.8b
+uxtlv7.8h, v7.8b
+.endif
+ldr w7, [x3] // tc[0]
+ldr w8, [x3, #4] // tc[1]
+dup v18.4h, w7
+dup v19.4h, w8
+trn1v18.2d, v18.2d, v19.2d
+.if \bitdepth > 8
+shl v18.8h, v18.8h, #(\bitdepth - 8)
+.endif
+dup v27.8h, w2 // beta
+// tc25
+shl v19.8h, v18.8h, #2 // * 4
+add v19.8h, v19.8h, v18.8h // (tc * 5)
+srshr   v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
+sshrv17.8h, v27.8h, #2 // beta2
+
+// beta_2 check
+// dp0  = abs(P2  - 2 * P1  + P0)
+add v22.8h, v3.8h, v1.8h
+shl v23.8h, v2.8h, #1
+sabdv30.8h, v22.8h, v23.8h
+// dq0  = abs(Q2  - 2 * Q1  + Q0)
+add v21.8h, v6.8h, v4.8h
+shl v26.8h, v5.8h, #1
+sabdv31.8h, v21.8h, v26.8h
+// d0   = dp0 + dq0
+add v20.8h, v30.8h, v31.8h
+shl v25.8h, v20.8h, #1
+// (d0 << 1) < beta_2
+cmgtv23.8h, v17.8h, v25.8h
+
+// beta check
+// d0 + d3 < beta
+mov x9, #0xFFFF00000000FFFF
+dup v24.2d, x9
+and v25.16b, v24.16b, v20.16b
+addpv25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
+addpv25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
+cmgtv25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
+mov w9, v25.s[0]


I don't quite understand what this sequence does and/or how our data is 
laid out in our registers - we have d0 on input in v20, where's d3? And 
doesn't the "and" throw away half of the input elements here?


I see some similar patterns with the masking and handling below as well - 
I get a feeling that I don't quite understand the algorithm here, and/or 
the data layout.



+.if \bitdepth > 8
+ld1 {v0.8h}, [x0], x1
+ld1 {v1.8h}, [x0], x1
+ld1 {v2.8h}, [x0], x1
+ld1 {v3.8h}, [x0], x1
+ld1 {v4.8h}, [x0], x1
+ld1 {v5.8h}, [x0], x1
+ld1 {v6.8h}, [x0], x1
+ld1 {v7.8h}, [x0]
+mov w14, #((1 << \bitdepth) - 1)


For loads like these, we can generally save a bit by using two alternating 
registers for loading, with a double stride - see e.g. the vp9 loop 
filter implementations. But that's a micro optimization.


Other than that, this mostly looks reasonable.

// Martin

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v4] avcodec/aarch64/hevc: add luma deblock NEON

2024-02-27 Thread J. Dekker
Benched using single-threaded full decode on an Ampere Altra.

Bpp Before  After  Speedup
8   73,3s   65,2s  1.124x
10  114,2s  104,0s 1.098x
12  125,8s  115,7s 1.087x

Signed-off-by: J. Dekker 
---

 Slightly improved 12bit version.

 libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  18 +
 2 files changed, 435 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S 
b/libavcodec/aarch64/hevcdsp_deblock_neon.S
index 8227f65649..581056a91e 100644
--- a/libavcodec/aarch64/hevcdsp_deblock_neon.S
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12
 hevc_v_loop_filter_chroma 8
 hevc_v_loop_filter_chroma 10
 hevc_v_loop_filter_chroma 12
+
+.macro hevc_loop_filter_luma_body bitdepth
+function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0
+.if \bitdepth > 8
+lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8
+.else
+uxtlv0.8h, v0.8b
+uxtlv1.8h, v1.8b
+uxtlv2.8h, v2.8b
+uxtlv3.8h, v3.8b
+uxtlv4.8h, v4.8b
+uxtlv5.8h, v5.8b
+uxtlv6.8h, v6.8b
+uxtlv7.8h, v7.8b
+.endif
+ldr w7, [x3] // tc[0]
+ldr w8, [x3, #4] // tc[1]
+dup v18.4h, w7
+dup v19.4h, w8
+trn1v18.2d, v18.2d, v19.2d
+.if \bitdepth > 8
+shl v18.8h, v18.8h, #(\bitdepth - 8)
+.endif
+dup v27.8h, w2 // beta
+// tc25
+shl v19.8h, v18.8h, #2 // * 4
+add v19.8h, v19.8h, v18.8h // (tc * 5)
+srshr   v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1
+sshrv17.8h, v27.8h, #2 // beta2
+
+// beta_2 check
+// dp0  = abs(P2  - 2 * P1  + P0)
+add v22.8h, v3.8h, v1.8h
+shl v23.8h, v2.8h, #1
+sabdv30.8h, v22.8h, v23.8h
+// dq0  = abs(Q2  - 2 * Q1  + Q0)
+add v21.8h, v6.8h, v4.8h
+shl v26.8h, v5.8h, #1
+sabdv31.8h, v21.8h, v26.8h
+// d0   = dp0 + dq0
+add v20.8h, v30.8h, v31.8h
+shl v25.8h, v20.8h, #1
+// (d0 << 1) < beta_2
+cmgtv23.8h, v17.8h, v25.8h
+
+// beta check
+// d0 + d3 < beta
+mov x9, #0xFFFF00000000FFFF
+dup v24.2d, x9
+and v25.16b, v24.16b, v20.16b
+addpv25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1
+addpv25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1
+cmgtv25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1]
+mov w9, v25.s[0]
+cmp w9, #0
+sxtlv26.4s, v25.4h
+sxtlv16.2d, v26.2s // full skip mask
+b.eq 3f // skip both blocks
+
+// TODO: we can check the full skip mask with the weak/strong mask to
+// potentially skip weak or strong calculation entirely if we only 
have one
+
+// beta_3 check
+// abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3
+sshrv17.8h, v17.8h, #1 // beta_3
+sabdv20.8h, v0.8h, v3.8h
+sabav20.8h, v7.8h, v4.8h
+cmgtv21.8h, v17.8h, v20.8h
+
+and v23.16b, v23.16b, v21.16b
+
+// tc25 check
+// abs(P0  -  Q0) < tc25
+sabdv20.8h, v3.8h, v4.8h
+cmgtv21.8h, v19.8h, v20.8h
+
+and v23.16b, v23.16b, v21.16b
+
+// Generate low/high line max from lines 0/3/4/7
+// mask out lines 2/3/5/6
+not v20.16b, v24.16b // 0x0000FFFFFFFF0000
+orr v23.16b, v23.16b, v20.16b
+
+// generate weak/strong mask
+uminp   v23.8h, v23.8h, v23.8h // extend to singles
+sxtlv23.4s, v23.4h
+uminp   v26.4s, v23.4s, v23.4s // check lines
+// extract to gpr
+ext v25.16b, v26.16b, v26.16b, #2
+zip1v17.4s, v26.4s, v26.4s
+mov w12, v25.s[0]
+mov w11, #0xFFFF
+mov w13, #0xFFFF0000
+// 0xFFFFFFFF -> strong strong
+// 0xFFFF0000 -> strong weak
+// 0x0000FFFF -> weak   strong
+// 0x00000000 -> weak   weak
+cmp w12, w13
+b.hi 0f // only strong/strong, skip weak nd_p/nd_q calc
+
+// weak nd_p/nd_q
+// d0+d3
+and v30.16b, v30.16b, v24.16b // d0 __ __ d3 d4 __ __ d7
+and v31.16b, v31.16b, v24.16b
+addpv30.8h, v30.8h, v30.8h // [d0+__ __+d3 d4+__ __+d7] [