[FFmpeg-devel] [PATCH v2 3/3] tests/checkasm/vvc_alf: change alf step size to 8

2024-05-30 Thread toqsxw
From: Wu Jianhua 

>From Benjamin Bross:
> for ALF where functions are in increments of 4 while 8 should be sufficient 
> according to the spec.

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/vvc_alf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c
index f35fd2cd3e..84b0f9da15 100644
--- a/tests/checkasm/vvc_alf.c
+++ b/tests/checkasm/vvc_alf.c
@@ -90,8 +90,8 @@ static void check_alf_filter(VVCDSPContext *c, const int 
bit_depth)
 randomize_buffers2(filter, LUMA_PARAMS_SIZE, 1);
 randomize_buffers2(clip, LUMA_PARAMS_SIZE, 0);
 
-for (int h = 4; h <= MAX_CTU_SIZE; h += 4) {
-for (int w = 4; w <= MAX_CTU_SIZE; w += 4) {
+for (int h = 4; h <= MAX_CTU_SIZE; h += 8) {
+for (int w = 4; w <= MAX_CTU_SIZE; w += 8) {
 const int ctu_size = MAX_CTU_SIZE;
 if (check_func(c->alf.filter[LUMA], 
"vvc_alf_filter_luma_%dx%d_%d", w, h, bit_depth)) {
 const int vb_pos = ctu_size - ALF_VB_POS_ABOVE_LUMA;
@@ -142,8 +142,8 @@ static void check_alf_classify(VVCDSPContext *c, const int 
bit_depth)
 
 randomize_buffers(src0, src1, SRC_BUF_SIZE);
 
-for (int h = 4; h <= MAX_CTU_SIZE; h += 4) {
-for (int w = 4; w <= MAX_CTU_SIZE; w += 4) {
+for (int h = 4; h <= MAX_CTU_SIZE; h += 8) {
+for (int w = 4; w <= MAX_CTU_SIZE; w += 8) {
 const int id_size = w * h / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * 
sizeof(int);
 const int vb_pos  = MAX_CTU_SIZE - ALF_BLOCK_SIZE;
 if (check_func(c->alf.classify, "vvc_alf_classify_%dx%d_%d", w, h, 
bit_depth)) {
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 2/3] avcodec/x86/vvc/vvc_alf: use xq to match ptrdiff_t

2024-05-30 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvc_alf.asm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm
index f7b3e2a6cc..b35dd9b0e9 100644
--- a/libavcodec/x86/vvc/vvc_alf.asm
+++ b/libavcodec/x86/vvc/vvc_alf.asm
@@ -409,7 +409,7 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, 
dst_stride, src, src_s
 .loop:
 push srcq
 push dstq
-xor   xd, xd
+xor   xq, xq
 
 .loop_w:
 LOAD_PARAMS
@@ -417,8 +417,8 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, 
dst_stride, src, src_s
 
 add srcq, 16 * ps
 add dstq, 16 * ps
-add   xd, 16
-cmp   xd, widthd
+add   xq, 16
+cmp   xq, widthq
 jl   .loop_w
 
 pop dstq
@@ -427,7 +427,7 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, 
dst_stride, src, src_s
 lea dstq, [dstq + 4 * dst_strideq]
 
 lea  filterq, [filterq + 2 * strideq]
-lea clipq, [clipq + 2 * strideq]
+lea clipq, [clipq   + 2 * strideq]
 
 sub  vb_posq, 4
 sub  heightq, 4
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 1/3] avcodec/x86/vvc/vvc_alf: fix integer overflow

2024-05-30 Thread toqsxw
From: Wu Jianhua 

Some tests fail with certain seeds

tests/checkasm/checkasm 2325607578 --test=vvc_alf
checkasm: using random seed 2325607578
AVX2:
vvc_alf_filter_luma_120x20_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x24_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x28_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x32_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x36_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x40_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x44_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x48_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x52_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x56_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x60_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x64_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x68_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x72_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x76_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x80_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x84_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x88_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x92_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x96_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x100_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x104_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x108_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x112_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x116_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x120_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x124_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x128_12_avx2 (vvc_alf.c:104)
  - vvc_alf.alf_filter   [FAILED]
  - vvc_alf.alf_classify [OK]
checkasm: 28 of 9216 tests have failed

Reported-by: James Almer 
Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvc_alf.asm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm
index 71e821c27b..f7b3e2a6cc 100644
--- a/libavcodec/x86/vvc/vvc_alf.asm
+++ b/libavcodec/x86/vvc/vvc_alf.asm
@@ -356,7 +356,8 @@ SECTION .text
 
 FILTER_VB xq
 
-paddw m0, m2
+; sum += curr
+paddsw m0, m2
 
 ; clip to pixel
 CLIPW m0, m14, m15
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 3/3] tests/checkasm/vvc_alf: change alf step size to 8

2024-05-29 Thread toqsxw
From: Wu Jianhua 

>From Benjamin Bross:
> for ALF where functions are in increments of 4 while 8 should be sufficient 
> according to the spec.

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/vvc_alf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c
index f35fd2cd3e..84b0f9da15 100644
--- a/tests/checkasm/vvc_alf.c
+++ b/tests/checkasm/vvc_alf.c
@@ -90,8 +90,8 @@ static void check_alf_filter(VVCDSPContext *c, const int 
bit_depth)
 randomize_buffers2(filter, LUMA_PARAMS_SIZE, 1);
 randomize_buffers2(clip, LUMA_PARAMS_SIZE, 0);
 
-for (int h = 4; h <= MAX_CTU_SIZE; h += 4) {
-for (int w = 4; w <= MAX_CTU_SIZE; w += 4) {
+for (int h = 4; h <= MAX_CTU_SIZE; h += 8) {
+for (int w = 4; w <= MAX_CTU_SIZE; w += 8) {
 const int ctu_size = MAX_CTU_SIZE;
 if (check_func(c->alf.filter[LUMA], 
"vvc_alf_filter_luma_%dx%d_%d", w, h, bit_depth)) {
 const int vb_pos = ctu_size - ALF_VB_POS_ABOVE_LUMA;
@@ -142,8 +142,8 @@ static void check_alf_classify(VVCDSPContext *c, const int 
bit_depth)
 
 randomize_buffers(src0, src1, SRC_BUF_SIZE);
 
-for (int h = 4; h <= MAX_CTU_SIZE; h += 4) {
-for (int w = 4; w <= MAX_CTU_SIZE; w += 4) {
+for (int h = 4; h <= MAX_CTU_SIZE; h += 8) {
+for (int w = 4; w <= MAX_CTU_SIZE; w += 8) {
 const int id_size = w * h / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * 
sizeof(int);
 const int vb_pos  = MAX_CTU_SIZE - ALF_BLOCK_SIZE;
 if (check_func(c->alf.classify, "vvc_alf_classify_%dx%d_%d", w, h, 
bit_depth)) {
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 2/3] avcodec/x86/vvc/vvc_alf: use xq to match ptrdiff_t

2024-05-29 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvc_alf.asm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm
index 91f158bac9..8bb698955c 100644
--- a/libavcodec/x86/vvc/vvc_alf.asm
+++ b/libavcodec/x86/vvc/vvc_alf.asm
@@ -421,7 +421,7 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, 
dst_stride, src, src_s
 .loop:
 push srcq
 push dstq
-xor   xd, xd
+xor   xq, xq
 
 .loop_w:
 LOAD_PARAMS
@@ -429,8 +429,8 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, 
dst_stride, src, src_s
 
 add srcq, 16 * ps
 add dstq, 16 * ps
-add   xd, 16
-cmp   xd, widthd
+add   xq, 16
+cmp   xq, widthq
 jl   .loop_w
 
 pop dstq
@@ -439,7 +439,7 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, 
dst_stride, src, src_s
 lea dstq, [dstq + 4 * dst_strideq]
 
 lea  filterq, [filterq + 2 * strideq]
-lea clipq, [clipq + 2 * strideq]
+lea clipq, [clipq   + 2 * strideq]
 
 sub  vb_posq, 4
 sub  heightq, 4
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/3] avcodec/x86/vvc/vvc_alf: fix integer overflow

2024-05-29 Thread toqsxw
From: Wu Jianhua 

Some tests fail with certain seeds

tests/checkasm/checkasm 2325607578 --test=vvc_alf
checkasm: using random seed 2325607578
AVX2:
vvc_alf_filter_luma_120x20_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x24_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x28_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x32_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x36_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x40_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x44_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x48_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x52_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x56_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x60_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x64_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x68_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x72_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x76_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x80_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x84_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x88_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x92_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x96_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x100_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x104_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x108_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x112_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x116_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x120_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x124_12_avx2 (vvc_alf.c:104)
vvc_alf_filter_luma_120x128_12_avx2 (vvc_alf.c:104)
  - vvc_alf.alf_filter   [FAILED]
  - vvc_alf.alf_classify [OK]
checkasm: 28 of 9216 tests have failed

Reported-by: James Almer 
Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvc_alf.asm | 13 +
 1 file changed, 13 insertions(+)

diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm
index 71e821c27b..91f158bac9 100644
--- a/libavcodec/x86/vvc/vvc_alf.asm
+++ b/libavcodec/x86/vvc/vvc_alf.asm
@@ -278,7 +278,9 @@ SECTION .text
 psrad m0, SHIFT + 3
 psrad m1, SHIFT + 3
 %%shift_end:
+%if ps == 1
 packssdw  m0, m0, m1
+%endif
 %endmacro
 
 ; FILTER_VB(line)
@@ -356,7 +358,18 @@ SECTION .text
 
 FILTER_VB xq
 
+; sum += curr
+%if ps == 1
 paddw m0, m2
+%else
+vpunpcklqdq  m11, m2, m2
+vpunpckhqdq  m12, m2, m2
+vpunpcklwd   m11, m11, m14
+vpunpcklwd   m12, m12, m14
+paddd m0, m11
+paddd m1, m12
+packssdw  m0, m0, m1
+%endif
 
 ; clip to pixel
 CLIPW m0, m14, m15
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3 1/4] avcodec/x86/vvc: add alf filter luma and chroma avx2 optimizations

2024-05-13 Thread toqsxw
From: Wu Jianhua 

ff_vvc_alf_filter_luma_4x4_10_c: 135
ff_vvc_alf_filter_luma_4x4_10_avx2: 54
ff_vvc_alf_filter_luma_4x8_10_c: 268
ff_vvc_alf_filter_luma_4x8_10_avx2: 106
ff_vvc_alf_filter_luma_4x12_10_c: 400
ff_vvc_alf_filter_luma_4x12_10_avx2: 160
ff_vvc_alf_filter_luma_4x16_10_c: 535
ff_vvc_alf_filter_luma_4x16_10_avx2: 213
ff_vvc_alf_filter_luma_4x20_10_c: 646
ff_vvc_alf_filter_luma_4x20_10_avx2: 262
ff_vvc_alf_filter_luma_4x24_10_c: 783
ff_vvc_alf_filter_luma_4x24_10_avx2: 309
ff_vvc_alf_filter_luma_4x28_10_c: 908
ff_vvc_alf_filter_luma_4x28_10_avx2: 361
ff_vvc_alf_filter_luma_4x32_10_c: 1039
ff_vvc_alf_filter_luma_4x32_10_avx2: 412
ff_vvc_alf_filter_luma_8x4_10_c: 260
ff_vvc_alf_filter_luma_8x4_10_avx2: 53
ff_vvc_alf_filter_luma_8x8_10_c: 516
ff_vvc_alf_filter_luma_8x8_10_avx2: 105
ff_vvc_alf_filter_luma_8x12_10_c: 779
ff_vvc_alf_filter_luma_8x12_10_avx2: 157
ff_vvc_alf_filter_luma_8x16_10_c: 1038
ff_vvc_alf_filter_luma_8x16_10_avx2: 210
ff_vvc_alf_filter_luma_8x20_10_c: 1293
ff_vvc_alf_filter_luma_8x20_10_avx2: 259
ff_vvc_alf_filter_luma_8x24_10_c: 1553
ff_vvc_alf_filter_luma_8x24_10_avx2: 309
ff_vvc_alf_filter_luma_8x28_10_c: 1815
ff_vvc_alf_filter_luma_8x28_10_avx2: 361
ff_vvc_alf_filter_luma_8x32_10_c: 2067
ff_vvc_alf_filter_luma_8x32_10_avx2: 419
ff_vvc_alf_filter_luma_12x4_10_c: 390
ff_vvc_alf_filter_luma_12x4_10_avx2: 54
ff_vvc_alf_filter_luma_12x8_10_c: 773
ff_vvc_alf_filter_luma_12x8_10_avx2: 107
ff_vvc_alf_filter_luma_12x12_10_c: 1159
ff_vvc_alf_filter_luma_12x12_10_avx2: 155
ff_vvc_alf_filter_luma_12x16_10_c: 1550
ff_vvc_alf_filter_luma_12x16_10_avx2: 207
ff_vvc_alf_filter_luma_12x20_10_c: 1970
ff_vvc_alf_filter_luma_12x20_10_avx2: 260
ff_vvc_alf_filter_luma_12x24_10_c: 2379
ff_vvc_alf_filter_luma_12x24_10_avx2: 309
ff_vvc_alf_filter_luma_12x28_10_c: 2763
ff_vvc_alf_filter_luma_12x28_10_avx2: 362
ff_vvc_alf_filter_luma_12x32_10_c: 3158
ff_vvc_alf_filter_luma_12x32_10_avx2: 419
ff_vvc_alf_filter_luma_16x4_10_c: 523
ff_vvc_alf_filter_luma_16x4_10_avx2: 53
ff_vvc_alf_filter_luma_16x8_10_c: 1049
ff_vvc_alf_filter_luma_16x8_10_avx2: 103
ff_vvc_alf_filter_luma_16x12_10_c: 1566
ff_vvc_alf_filter_luma_16x12_10_avx2: 159
ff_vvc_alf_filter_luma_16x16_10_c: 2078
ff_vvc_alf_filter_luma_16x16_10_avx2: 211
ff_vvc_alf_filter_luma_16x20_10_c: 2631
ff_vvc_alf_filter_luma_16x20_10_avx2: 259
ff_vvc_alf_filter_luma_16x24_10_c: 3149
ff_vvc_alf_filter_luma_16x24_10_avx2: 316
ff_vvc_alf_filter_luma_16x28_10_c: 3631
ff_vvc_alf_filter_luma_16x28_10_avx2: 359
ff_vvc_alf_filter_luma_16x32_10_c: 4233
ff_vvc_alf_filter_luma_16x32_10_avx2: 428
ff_vvc_alf_filter_luma_20x4_10_c: 649
ff_vvc_alf_filter_luma_20x4_10_avx2: 106
ff_vvc_alf_filter_luma_20x8_10_c: 1294
ff_vvc_alf_filter_luma_20x8_10_avx2: 206
ff_vvc_alf_filter_luma_20x12_10_c: 1936
ff_vvc_alf_filter_luma_20x12_10_avx2: 310
ff_vvc_alf_filter_luma_20x16_10_c: 2594
ff_vvc_alf_filter_luma_20x16_10_avx2: 411
ff_vvc_alf_filter_luma_20x20_10_c: 3234
ff_vvc_alf_filter_luma_20x20_10_avx2: 517
ff_vvc_alf_filter_luma_20x24_10_c: 3894
ff_vvc_alf_filter_luma_20x24_10_avx2: 621
ff_vvc_alf_filter_luma_20x28_10_c: 4542
ff_vvc_alf_filter_luma_20x28_10_avx2: 722
ff_vvc_alf_filter_luma_20x32_10_c: 5205
ff_vvc_alf_filter_luma_20x32_10_avx2: 832
ff_vvc_alf_filter_luma_24x4_10_c: 774
ff_vvc_alf_filter_luma_24x4_10_avx2: 104
ff_vvc_alf_filter_luma_24x8_10_c: 1546
ff_vvc_alf_filter_luma_24x8_10_avx2: 206
ff_vvc_alf_filter_luma_24x12_10_c: 2318
ff_vvc_alf_filter_luma_24x12_10_avx2: 312
ff_vvc_alf_filter_luma_24x16_10_c: 3104
ff_vvc_alf_filter_luma_24x16_10_avx2: 411
ff_vvc_alf_filter_luma_24x20_10_c: 3893
ff_vvc_alf_filter_luma_24x20_10_avx2: 513
ff_vvc_alf_filter_luma_24x24_10_c: 4681
ff_vvc_alf_filter_luma_24x24_10_avx2: 616
ff_vvc_alf_filter_luma_24x28_10_c: 5474
ff_vvc_alf_filter_luma_24x28_10_avx2: 721
ff_vvc_alf_filter_luma_24x32_10_c: 6271
ff_vvc_alf_filter_luma_24x32_10_avx2: 832
ff_vvc_alf_filter_luma_28x4_10_c: 907
ff_vvc_alf_filter_luma_28x4_10_avx2: 103
ff_vvc_alf_filter_luma_28x8_10_c: 1797
ff_vvc_alf_filter_luma_28x8_10_avx2: 206
ff_vvc_alf_filter_luma_28x12_10_c: 2708
ff_vvc_alf_filter_luma_28x12_10_avx2: 309
ff_vvc_alf_filter_luma_28x16_10_c: 3632
ff_vvc_alf_filter_luma_28x16_10_avx2: 413
ff_vvc_alf_filter_luma_28x20_10_c: 4537
ff_vvc_alf_filter_luma_28x20_10_avx2: 519
ff_vvc_alf_filter_luma_28x24_10_c: 5463
ff_vvc_alf_filter_luma_28x24_10_avx2: 616
ff_vvc_alf_filter_luma_28x28_10_c: 6372
ff_vvc_alf_filter_luma_28x28_10_avx2: 719
ff_vvc_alf_filter_luma_28x32_10_c: 7274
ff_vvc_alf_filter_luma_28x32_10_avx2: 823
ff_vvc_alf_filter_luma_32x4_10_c: 1029
ff_vvc_alf_filter_luma_32x4_10_avx2: 104
ff_vvc_alf_filter_luma_32x8_10_c: 2060
ff_vvc_alf_filter_luma_32x8_10_avx2: 206
ff_vvc_alf_filter_luma_32x12_10_c: 3112
ff_vvc_alf_filter_luma_32x12_10_avx2: 307
ff_vvc_alf_filter_luma_32x16_10_c: 4161
ff_vvc_alf_filter_luma_32x16_10_avx2: 413
ff_vvc_alf_filter_luma_32x20_10_c: 5211
ff_vvc_alf_filter_luma_32x20_10_avx2: 514
ff_vvc_alf_filter_luma_32x24_10_c: 6238

[FFmpeg-devel] [PATCH v3 4/4] tests/checkasm/vvc_alf: add check_alf_classify

2024-05-13 Thread toqsxw
From: Wu Jianhua 

Performance Test (fps):
clip  before  after delta
Tango2_3840x2160_60_10_420_27_LD.266  56  115   105.36%
RitualDance_1920x1080_60_10_420_32_LD.266 272 481   76.83%
RitualDance_1920x1080_60_10_420_37_RA.266 303 426   40.59%

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/vvc_alf.c | 47 
 1 file changed, 47 insertions(+)

diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c
index 10469e1528..9526260598 100644
--- a/tests/checkasm/vvc_alf.c
+++ b/tests/checkasm/vvc_alf.c
@@ -121,6 +121,47 @@ static void check_alf_filter(VVCDSPContext *c, const int 
bit_depth)
 }
 }
 
+static void check_alf_classify(VVCDSPContext *c, const int bit_depth)
+{
+LOCAL_ALIGNED_32(int, class_idx0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int, transpose_idx0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int, class_idx1, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int, transpose_idx1, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int32_t, alf_gradient_tmp, [ALF_GRADIENT_SIZE * 
ALF_GRADIENT_SIZE * ALF_NUM_DIR]);
+
+ptrdiff_t stride = SRC_PIXEL_STRIDE * SIZEOF_PIXEL;
+int offset = (3 * SRC_PIXEL_STRIDE + 3) * SIZEOF_PIXEL;
+
+declare_func_emms(AV_CPU_FLAG_AVX2, void, int *class_idx, int 
*transpose_idx,
+const uint8_t *src, ptrdiff_t src_stride, int width, int height, int 
vb_pos, int *gradient_tmp);
+
+randomize_buffers(src0, src1, SRC_BUF_SIZE);
+
+for (int h = 4; h <= MAX_CTU_SIZE; h += 4) {
+for (int w = 4; w <= MAX_CTU_SIZE; w += 4) {
+const int id_size = w * h / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * 
sizeof(int);
+const int vb_pos  = MAX_CTU_SIZE - ALF_BLOCK_SIZE;
+if (check_func(c->alf.classify, "vvc_alf_classify_%dx%d_%d", w, h, 
bit_depth)) {
+memset(class_idx0, 0, id_size);
+memset(class_idx1, 0, id_size);
+memset(transpose_idx0, 0, id_size);
+memset(transpose_idx1, 0, id_size);
+call_ref(class_idx0, transpose_idx0, src0 + offset, stride, w, 
h, vb_pos, alf_gradient_tmp);
+
+call_new(class_idx1, transpose_idx1, src1 + offset, stride, w, 
h, vb_pos, alf_gradient_tmp);
+
+if (memcmp(class_idx0, class_idx1, id_size))
+fail();
+if (memcmp(transpose_idx0, transpose_idx1, id_size))
+fail();
+bench_new(class_idx1, transpose_idx1, src1 + offset, stride, 
w, h, vb_pos, alf_gradient_tmp);
+}
+}
+}
+}
+
 void checkasm_check_vvc_alf(void)
 {
 int bit_depth;
@@ -130,4 +171,10 @@ void checkasm_check_vvc_alf(void)
 check_alf_filter(, bit_depth);
 }
 report("alf_filter");
+
+for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+ff_vvc_dsp_init(, bit_depth);
+check_alf_classify(, bit_depth);
+}
+report("alf_classify");
 }
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3 2/4] tests/checkasm: add checkasm_check_vvc_alf and check_alf_filter

2024-05-13 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/Makefile   |   2 +-
 tests/checkasm/checkasm.c |   3 +-
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vvc_alf.c  | 133 ++
 4 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 tests/checkasm/vvc_alf.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index b5bb885201..92624aab0a 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -43,7 +43,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
-AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_mc.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_alf.o vvc_mc.o
 
 CHECKASMOBJS-$(CONFIG_AVCODEC)  += $(AVCODECOBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 04f94f9d09..ffc89882b1 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -204,7 +204,8 @@ static const struct {
 { "vorbisdsp", checkasm_check_vorbisdsp },
 #endif
 #if CONFIG_VVC_DECODER
-{ "vvc_mc", checkasm_check_vvc_mc },
+{ "vvc_alf", checkasm_check_vvc_alf },
+{ "vvc_mc",  checkasm_check_vvc_mc  },
 #endif
 #endif
 #if CONFIG_AVFILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 8807a37a43..07fcc751ff 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -134,6 +134,7 @@ void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
 void checkasm_check_vorbisdsp(void);
+void checkasm_check_vvc_alf(void);
 void checkasm_check_vvc_mc(void);
 
 struct CheckasmPerf;
diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c
new file mode 100644
index 0000000000..10469e1528
--- /dev/null
+++ b/tests/checkasm/vvc_alf.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2023-2024 Nuo Mi 
+ * Copyright (c) 2023-2024 Wu Jianhua 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+#include "libavcodec/vvc/ctu.h"
+#include "libavcodec/vvc/data.h"
+#include "libavcodec/vvc/dsp.h"
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
+#define SRC_PIXEL_STRIDE (MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE)
+#define DST_PIXEL_STRIDE (SRC_PIXEL_STRIDE + 4)
+#define SRC_BUF_SIZE (SRC_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2) //+3 * 2 
for top and bottom row, *2 for high bit depth
+#define DST_BUF_SIZE (DST_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2)
+#define LUMA_PARAMS_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE / ALF_BLOCK_SIZE / 
ALF_BLOCK_SIZE * ALF_NUM_COEFF_LUMA)
+
+#define randomize_buffers(buf0, buf1, size) \
+do {\
+uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+int k;  \
+for (k = 0; k < size; k += 4) { \
+uint32_t r = rnd() & mask;  \
+AV_WN32A(buf0 + k, r);  \
+AV_WN32A(buf1 + k, r);  \
+}   \
+} while (0)
+
+#define randomize_buffers2(buf, size, filter)   \
+do {\
+int k;  \
+if (filter) {   \
+for (k = 0; k < size; k++) {\
+int8_t r = rnd();   \
+buf[k] = r; \
+}   \
+} else {\
+for (k = 0; k < size; k++) {\
+int r = rnd() % FF_ARRAY_ELEMS(clip_set);   \
+buf[k] = clip_set[r];   \
+}   

[FFmpeg-devel] [PATCH v2 4/4] tests/checkasm/vvc_alf: add check_alf_classify

2024-05-01 Thread toqsxw
From: Wu Jianhua 

Performance Test (fps):
clip  before  after delta
Tango2_3840x2160_60_10_420_27_LD.266  56  115   105.36%
RitualDance_1920x1080_60_10_420_32_LD.266 272 481   76.83%
RitualDance_1920x1080_60_10_420_37_RA.266 303 426   40.59%

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/vvc_alf.c | 47 
 1 file changed, 47 insertions(+)

diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c
index 10469e1528..9526260598 100644
--- a/tests/checkasm/vvc_alf.c
+++ b/tests/checkasm/vvc_alf.c
@@ -121,6 +121,47 @@ static void check_alf_filter(VVCDSPContext *c, const int 
bit_depth)
 }
 }
 
+static void check_alf_classify(VVCDSPContext *c, const int bit_depth)
+{
+LOCAL_ALIGNED_32(int, class_idx0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int, transpose_idx0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int, class_idx1, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int, transpose_idx1, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int32_t, alf_gradient_tmp, [ALF_GRADIENT_SIZE * 
ALF_GRADIENT_SIZE * ALF_NUM_DIR]);
+
+ptrdiff_t stride = SRC_PIXEL_STRIDE * SIZEOF_PIXEL;
+int offset = (3 * SRC_PIXEL_STRIDE + 3) * SIZEOF_PIXEL;
+
+declare_func_emms(AV_CPU_FLAG_AVX2, void, int *class_idx, int 
*transpose_idx,
+const uint8_t *src, ptrdiff_t src_stride, int width, int height, int 
vb_pos, int *gradient_tmp);
+
+randomize_buffers(src0, src1, SRC_BUF_SIZE);
+
+for (int h = 4; h <= MAX_CTU_SIZE; h += 4) {
+for (int w = 4; w <= MAX_CTU_SIZE; w += 4) {
+const int id_size = w * h / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * 
sizeof(int);
+const int vb_pos  = MAX_CTU_SIZE - ALF_BLOCK_SIZE;
+if (check_func(c->alf.classify, "vvc_alf_classify_%dx%d_%d", w, h, 
bit_depth)) {
+memset(class_idx0, 0, id_size);
+memset(class_idx1, 0, id_size);
+memset(transpose_idx0, 0, id_size);
+memset(transpose_idx1, 0, id_size);
+call_ref(class_idx0, transpose_idx0, src0 + offset, stride, w, 
h, vb_pos, alf_gradient_tmp);
+
+call_new(class_idx1, transpose_idx1, src1 + offset, stride, w, 
h, vb_pos, alf_gradient_tmp);
+
+if (memcmp(class_idx0, class_idx1, id_size))
+fail();
+if (memcmp(transpose_idx0, transpose_idx1, id_size))
+fail();
+bench_new(class_idx1, transpose_idx1, src1 + offset, stride, 
w, h, vb_pos, alf_gradient_tmp);
+}
+}
+}
+}
+
 void checkasm_check_vvc_alf(void)
 {
 int bit_depth;
@@ -130,4 +171,10 @@ void checkasm_check_vvc_alf(void)
 check_alf_filter(, bit_depth);
 }
 report("alf_filter");
+
+for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+ff_vvc_dsp_init(, bit_depth);
+check_alf_classify(, bit_depth);
+}
+report("alf_classify");
 }
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 2/4] tests/checkasm: add checkasm_check_vvc_alf and check_alf_filter

2024-05-01 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/Makefile   |   2 +-
 tests/checkasm/checkasm.c |   3 +-
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vvc_alf.c  | 133 ++
 4 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 tests/checkasm/vvc_alf.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 2673e1d098..5a3e3985c4 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -41,7 +41,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
-AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_mc.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_alf.o vvc_mc.o
 
 CHECKASMOBJS-$(CONFIG_AVCODEC)  += $(AVCODECOBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 8be6cb0f55..8b2bf2827b 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -198,7 +198,8 @@ static const struct {
 { "vorbisdsp", checkasm_check_vorbisdsp },
 #endif
 #if CONFIG_VVC_DECODER
-{ "vvc_mc", checkasm_check_vvc_mc },
+{ "vvc_alf", checkasm_check_vvc_alf },
+{ "vvc_mc",  checkasm_check_vvc_mc  },
 #endif
 #endif
 #if CONFIG_AVFILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index f90920dee7..c6a5cf42dd 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -132,6 +132,7 @@ void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
 void checkasm_check_vorbisdsp(void);
+void checkasm_check_vvc_alf(void);
 void checkasm_check_vvc_mc(void);
 
 struct CheckasmPerf;
diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c
new file mode 100644
index 0000000000..10469e1528
--- /dev/null
+++ b/tests/checkasm/vvc_alf.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2023-2024 Nuo Mi 
+ * Copyright (c) 2023-2024 Wu Jianhua 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+#include "libavcodec/vvc/ctu.h"
+#include "libavcodec/vvc/data.h"
+#include "libavcodec/vvc/dsp.h"
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
+#define SRC_PIXEL_STRIDE (MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE)
+#define DST_PIXEL_STRIDE (SRC_PIXEL_STRIDE + 4)
+#define SRC_BUF_SIZE (SRC_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2) //+3 * 2 
for top and bottom row, *2 for high bit depth
+#define DST_BUF_SIZE (DST_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2)
+#define LUMA_PARAMS_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE / ALF_BLOCK_SIZE / 
ALF_BLOCK_SIZE * ALF_NUM_COEFF_LUMA)
+
+#define randomize_buffers(buf0, buf1, size) \
+do {\
+uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+int k;  \
+for (k = 0; k < size; k += 4) { \
+uint32_t r = rnd() & mask;  \
+AV_WN32A(buf0 + k, r);  \
+AV_WN32A(buf1 + k, r);  \
+}   \
+} while (0)
+
+#define randomize_buffers2(buf, size, filter)   \
+do {\
+int k;  \
+if (filter) {   \
+for (k = 0; k < size; k++) {\
+int8_t r = rnd();   \
+buf[k] = r; \
+}   \
+} else {\
+for (k = 0; k < size; k++) {\
+int r = rnd() % FF_ARRAY_ELEMS(clip_set);   \
+buf[k] = clip_set[r];   \
+}   

[FFmpeg-devel] [PATCH v2 1/4] avcodec/x86/vvc: add alf filter luma and chroma avx2 optimizations

2024-05-01 Thread toqsxw
From: Wu Jianhua 

ff_vvc_alf_filter_luma_4x4_10_c: 135
ff_vvc_alf_filter_luma_4x4_10_avx2: 54
ff_vvc_alf_filter_luma_4x8_10_c: 268
ff_vvc_alf_filter_luma_4x8_10_avx2: 106
ff_vvc_alf_filter_luma_4x12_10_c: 400
ff_vvc_alf_filter_luma_4x12_10_avx2: 160
ff_vvc_alf_filter_luma_4x16_10_c: 535
ff_vvc_alf_filter_luma_4x16_10_avx2: 213
ff_vvc_alf_filter_luma_4x20_10_c: 646
ff_vvc_alf_filter_luma_4x20_10_avx2: 262
ff_vvc_alf_filter_luma_4x24_10_c: 783
ff_vvc_alf_filter_luma_4x24_10_avx2: 309
ff_vvc_alf_filter_luma_4x28_10_c: 908
ff_vvc_alf_filter_luma_4x28_10_avx2: 361
ff_vvc_alf_filter_luma_4x32_10_c: 1039
ff_vvc_alf_filter_luma_4x32_10_avx2: 412
ff_vvc_alf_filter_luma_8x4_10_c: 260
ff_vvc_alf_filter_luma_8x4_10_avx2: 53
ff_vvc_alf_filter_luma_8x8_10_c: 516
ff_vvc_alf_filter_luma_8x8_10_avx2: 105
ff_vvc_alf_filter_luma_8x12_10_c: 779
ff_vvc_alf_filter_luma_8x12_10_avx2: 157
ff_vvc_alf_filter_luma_8x16_10_c: 1038
ff_vvc_alf_filter_luma_8x16_10_avx2: 210
ff_vvc_alf_filter_luma_8x20_10_c: 1293
ff_vvc_alf_filter_luma_8x20_10_avx2: 259
ff_vvc_alf_filter_luma_8x24_10_c: 1553
ff_vvc_alf_filter_luma_8x24_10_avx2: 309
ff_vvc_alf_filter_luma_8x28_10_c: 1815
ff_vvc_alf_filter_luma_8x28_10_avx2: 361
ff_vvc_alf_filter_luma_8x32_10_c: 2067
ff_vvc_alf_filter_luma_8x32_10_avx2: 419
ff_vvc_alf_filter_luma_12x4_10_c: 390
ff_vvc_alf_filter_luma_12x4_10_avx2: 54
ff_vvc_alf_filter_luma_12x8_10_c: 773
ff_vvc_alf_filter_luma_12x8_10_avx2: 107
ff_vvc_alf_filter_luma_12x12_10_c: 1159
ff_vvc_alf_filter_luma_12x12_10_avx2: 155
ff_vvc_alf_filter_luma_12x16_10_c: 1550
ff_vvc_alf_filter_luma_12x16_10_avx2: 207
ff_vvc_alf_filter_luma_12x20_10_c: 1970
ff_vvc_alf_filter_luma_12x20_10_avx2: 260
ff_vvc_alf_filter_luma_12x24_10_c: 2379
ff_vvc_alf_filter_luma_12x24_10_avx2: 309
ff_vvc_alf_filter_luma_12x28_10_c: 2763
ff_vvc_alf_filter_luma_12x28_10_avx2: 362
ff_vvc_alf_filter_luma_12x32_10_c: 3158
ff_vvc_alf_filter_luma_12x32_10_avx2: 419
ff_vvc_alf_filter_luma_16x4_10_c: 523
ff_vvc_alf_filter_luma_16x4_10_avx2: 53
ff_vvc_alf_filter_luma_16x8_10_c: 1049
ff_vvc_alf_filter_luma_16x8_10_avx2: 103
ff_vvc_alf_filter_luma_16x12_10_c: 1566
ff_vvc_alf_filter_luma_16x12_10_avx2: 159
ff_vvc_alf_filter_luma_16x16_10_c: 2078
ff_vvc_alf_filter_luma_16x16_10_avx2: 211
ff_vvc_alf_filter_luma_16x20_10_c: 2631
ff_vvc_alf_filter_luma_16x20_10_avx2: 259
ff_vvc_alf_filter_luma_16x24_10_c: 3149
ff_vvc_alf_filter_luma_16x24_10_avx2: 316
ff_vvc_alf_filter_luma_16x28_10_c: 3631
ff_vvc_alf_filter_luma_16x28_10_avx2: 359
ff_vvc_alf_filter_luma_16x32_10_c: 4233
ff_vvc_alf_filter_luma_16x32_10_avx2: 428
ff_vvc_alf_filter_luma_20x4_10_c: 649
ff_vvc_alf_filter_luma_20x4_10_avx2: 106
ff_vvc_alf_filter_luma_20x8_10_c: 1294
ff_vvc_alf_filter_luma_20x8_10_avx2: 206
ff_vvc_alf_filter_luma_20x12_10_c: 1936
ff_vvc_alf_filter_luma_20x12_10_avx2: 310
ff_vvc_alf_filter_luma_20x16_10_c: 2594
ff_vvc_alf_filter_luma_20x16_10_avx2: 411
ff_vvc_alf_filter_luma_20x20_10_c: 3234
ff_vvc_alf_filter_luma_20x20_10_avx2: 517
ff_vvc_alf_filter_luma_20x24_10_c: 3894
ff_vvc_alf_filter_luma_20x24_10_avx2: 621
ff_vvc_alf_filter_luma_20x28_10_c: 4542
ff_vvc_alf_filter_luma_20x28_10_avx2: 722
ff_vvc_alf_filter_luma_20x32_10_c: 5205
ff_vvc_alf_filter_luma_20x32_10_avx2: 832
ff_vvc_alf_filter_luma_24x4_10_c: 774
ff_vvc_alf_filter_luma_24x4_10_avx2: 104
ff_vvc_alf_filter_luma_24x8_10_c: 1546
ff_vvc_alf_filter_luma_24x8_10_avx2: 206
ff_vvc_alf_filter_luma_24x12_10_c: 2318
ff_vvc_alf_filter_luma_24x12_10_avx2: 312
ff_vvc_alf_filter_luma_24x16_10_c: 3104
ff_vvc_alf_filter_luma_24x16_10_avx2: 411
ff_vvc_alf_filter_luma_24x20_10_c: 3893
ff_vvc_alf_filter_luma_24x20_10_avx2: 513
ff_vvc_alf_filter_luma_24x24_10_c: 4681
ff_vvc_alf_filter_luma_24x24_10_avx2: 616
ff_vvc_alf_filter_luma_24x28_10_c: 5474
ff_vvc_alf_filter_luma_24x28_10_avx2: 721
ff_vvc_alf_filter_luma_24x32_10_c: 6271
ff_vvc_alf_filter_luma_24x32_10_avx2: 832
ff_vvc_alf_filter_luma_28x4_10_c: 907
ff_vvc_alf_filter_luma_28x4_10_avx2: 103
ff_vvc_alf_filter_luma_28x8_10_c: 1797
ff_vvc_alf_filter_luma_28x8_10_avx2: 206
ff_vvc_alf_filter_luma_28x12_10_c: 2708
ff_vvc_alf_filter_luma_28x12_10_avx2: 309
ff_vvc_alf_filter_luma_28x16_10_c: 3632
ff_vvc_alf_filter_luma_28x16_10_avx2: 413
ff_vvc_alf_filter_luma_28x20_10_c: 4537
ff_vvc_alf_filter_luma_28x20_10_avx2: 519
ff_vvc_alf_filter_luma_28x24_10_c: 5463
ff_vvc_alf_filter_luma_28x24_10_avx2: 616
ff_vvc_alf_filter_luma_28x28_10_c: 6372
ff_vvc_alf_filter_luma_28x28_10_avx2: 719
ff_vvc_alf_filter_luma_28x32_10_c: 7274
ff_vvc_alf_filter_luma_28x32_10_avx2: 823
ff_vvc_alf_filter_luma_32x4_10_c: 1029
ff_vvc_alf_filter_luma_32x4_10_avx2: 104
ff_vvc_alf_filter_luma_32x8_10_c: 2060
ff_vvc_alf_filter_luma_32x8_10_avx2: 206
ff_vvc_alf_filter_luma_32x12_10_c: 3112
ff_vvc_alf_filter_luma_32x12_10_avx2: 307
ff_vvc_alf_filter_luma_32x16_10_c: 4161
ff_vvc_alf_filter_luma_32x16_10_avx2: 413
ff_vvc_alf_filter_luma_32x20_10_c: 5211
ff_vvc_alf_filter_luma_32x20_10_avx2: 514
ff_vvc_alf_filter_luma_32x24_10_c: 6238

[FFmpeg-devel] [PATCH 4/4] tests/checkasm/vvc_alf: add check_alf_classify

2024-04-29 Thread toqsxw
From: Wu Jianhua 

Performance Test:
clip                                       before (fps)  after (fps)  delta
Tango2_3840x2160_60_10_420_27_LD.266       56            115          105.36%
RitualDance_1920x1080_60_10_420_32_LD.266  272           481          76.83%
RitualDance_1920x1080_60_10_420_37_RA.266  303           426          40.59%

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/vvc_alf.c | 47 
 1 file changed, 47 insertions(+)

diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c
index 10469e1528..9526260598 100644
--- a/tests/checkasm/vvc_alf.c
+++ b/tests/checkasm/vvc_alf.c
@@ -121,6 +121,47 @@ static void check_alf_filter(VVCDSPContext *c, const int 
bit_depth)
 }
 }
 
+static void check_alf_classify(VVCDSPContext *c, const int bit_depth)
+{
+LOCAL_ALIGNED_32(int, class_idx0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int, transpose_idx0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int, class_idx1, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int, transpose_idx1, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int32_t, alf_gradient_tmp, [ALF_GRADIENT_SIZE * 
ALF_GRADIENT_SIZE * ALF_NUM_DIR]);
+
+ptrdiff_t stride = SRC_PIXEL_STRIDE * SIZEOF_PIXEL;
+int offset = (3 * SRC_PIXEL_STRIDE + 3) * SIZEOF_PIXEL;
+
+declare_func_emms(AV_CPU_FLAG_AVX2, void, int *class_idx, int 
*transpose_idx,
+const uint8_t *src, ptrdiff_t src_stride, int width, int height, int 
vb_pos, int *gradient_tmp);
+
+randomize_buffers(src0, src1, SRC_BUF_SIZE);
+
+for (int h = 4; h <= MAX_CTU_SIZE; h += 4) {
+for (int w = 4; w <= MAX_CTU_SIZE; w += 4) {
+const int id_size = w * h / ALF_BLOCK_SIZE / ALF_BLOCK_SIZE * 
sizeof(int);
+const int vb_pos  = MAX_CTU_SIZE - ALF_BLOCK_SIZE;
+if (check_func(c->alf.classify, "vvc_alf_classify_%dx%d_%d", w, h, 
bit_depth)) {
+memset(class_idx0, 0, id_size);
+memset(class_idx1, 0, id_size);
+memset(transpose_idx0, 0, id_size);
+memset(transpose_idx1, 0, id_size);
+call_ref(class_idx0, transpose_idx0, src0 + offset, stride, w, 
h, vb_pos, alf_gradient_tmp);
+
+call_new(class_idx1, transpose_idx1, src1 + offset, stride, w, 
h, vb_pos, alf_gradient_tmp);
+
+if (memcmp(class_idx0, class_idx1, id_size))
+fail();
+if (memcmp(transpose_idx0, transpose_idx1, id_size))
+fail();
+bench_new(class_idx1, transpose_idx1, src1 + offset, stride, 
w, h, vb_pos, alf_gradient_tmp);
+}
+}
+}
+}
+
 void checkasm_check_vvc_alf(void)
 {
 int bit_depth;
@@ -130,4 +171,10 @@ void checkasm_check_vvc_alf(void)
 check_alf_filter(, bit_depth);
 }
 report("alf_filter");
+
+for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+ff_vvc_dsp_init(, bit_depth);
+check_alf_classify(, bit_depth);
+}
+report("alf_classify");
 }
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 2/4] tests/checkasm: add checkasm_check_vvc_alf and check_alf_filter

2024-04-29 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/Makefile   |   2 +-
 tests/checkasm/checkasm.c |   3 +-
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vvc_alf.c  | 133 ++
 4 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 tests/checkasm/vvc_alf.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 2673e1d098..5a3e3985c4 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -41,7 +41,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
-AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_mc.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_alf.o vvc_mc.o
 
 CHECKASMOBJS-$(CONFIG_AVCODEC)  += $(AVCODECOBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 8be6cb0f55..8b2bf2827b 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -198,7 +198,8 @@ static const struct {
 { "vorbisdsp", checkasm_check_vorbisdsp },
 #endif
 #if CONFIG_VVC_DECODER
-{ "vvc_mc", checkasm_check_vvc_mc },
+{ "vvc_alf", checkasm_check_vvc_alf },
+{ "vvc_mc",  checkasm_check_vvc_mc  },
 #endif
 #endif
 #if CONFIG_AVFILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index f90920dee7..c6a5cf42dd 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -132,6 +132,7 @@ void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
 void checkasm_check_vorbisdsp(void);
+void checkasm_check_vvc_alf(void);
 void checkasm_check_vvc_mc(void);
 
 struct CheckasmPerf;
diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c
new file mode 100644
index 00..10469e1528
--- /dev/null
+++ b/tests/checkasm/vvc_alf.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2023-2024 Nuo Mi 
+ * Copyright (c) 2023-2024 Wu Jianhua 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+#include "libavcodec/vvc/ctu.h"
+#include "libavcodec/vvc/data.h"
+#include "libavcodec/vvc/dsp.h"
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
+#define SRC_PIXEL_STRIDE (MAX_CTU_SIZE + 2 * ALF_PADDING_SIZE)
+#define DST_PIXEL_STRIDE (SRC_PIXEL_STRIDE + 4)
+#define SRC_BUF_SIZE (SRC_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2) //+3 * 2 
for top and bottom row, *2 for high bit depth
+#define DST_BUF_SIZE (DST_PIXEL_STRIDE * (MAX_CTU_SIZE + 3 * 2) * 2)
+#define LUMA_PARAMS_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE / ALF_BLOCK_SIZE / 
ALF_BLOCK_SIZE * ALF_NUM_COEFF_LUMA)
+
+#define randomize_buffers(buf0, buf1, size) \
+do {\
+uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+int k;  \
+for (k = 0; k < size; k += 4) { \
+uint32_t r = rnd() & mask;  \
+AV_WN32A(buf0 + k, r);  \
+AV_WN32A(buf1 + k, r);  \
+}   \
+} while (0)
+
+#define randomize_buffers2(buf, size, filter)   \
+do {\
+int k;  \
+if (filter) {   \
+for (k = 0; k < size; k++) {\
+int8_t r = rnd();   \
+buf[k] = r; \
+}   \
+} else {\
+for (k = 0; k < size; k++) {\
+int r = rnd() % FF_ARRAY_ELEMS(clip_set);   \
+buf[k] = clip_set[r];   \
+}   

[FFmpeg-devel] [PATCH 1/4] avcodec/x86/vvc: add alf filter luma and chroma avx2 optimizations

2024-04-29 Thread toqsxw
From: Wu Jianhua 

vvc_alf_filter_chroma_4x4_10_c: 657.0
vvc_alf_filter_chroma_4x4_10_avx2: 138.0
vvc_alf_filter_chroma_4x8_10_c: 1264.7
vvc_alf_filter_chroma_4x8_10_avx2: 253.5
vvc_alf_filter_chroma_4x12_10_c: 1841.7
vvc_alf_filter_chroma_4x12_10_avx2: 375.5
vvc_alf_filter_chroma_4x16_10_c: 2442.7
vvc_alf_filter_chroma_4x16_10_avx2: 491.7
vvc_alf_filter_chroma_4x20_10_c: 3057.0
vvc_alf_filter_chroma_4x20_10_avx2: 607.2
vvc_alf_filter_chroma_4x24_10_c: 3667.0
vvc_alf_filter_chroma_4x24_10_avx2: 747.5
vvc_alf_filter_chroma_4x28_10_c: 4286.7
vvc_alf_filter_chroma_4x28_10_avx2: 849.0
vvc_alf_filter_chroma_4x32_10_c: 4886.0
vvc_alf_filter_chroma_4x32_10_avx2: 967.5
vvc_alf_filter_chroma_8x4_10_c: 1250.5
vvc_alf_filter_chroma_8x4_10_avx2: 261.0
vvc_alf_filter_chroma_8x8_10_c: 2430.7
vvc_alf_filter_chroma_8x8_10_avx2: 494.7
vvc_alf_filter_chroma_8x12_10_c: 3631.2
vvc_alf_filter_chroma_8x12_10_avx2: 734.5
vvc_alf_filter_chroma_8x16_10_c: 13675.7
vvc_alf_filter_chroma_8x16_10_avx2: 972.0
vvc_alf_filter_chroma_8x20_10_c: 6212.0
vvc_alf_filter_chroma_8x20_10_avx2: 1211.0
vvc_alf_filter_chroma_8x24_10_c: 7440.7
vvc_alf_filter_chroma_8x24_10_avx2: 1447.0
vvc_alf_filter_chroma_8x28_10_c: 8460.5
vvc_alf_filter_chroma_8x28_10_avx2: 1682.5
vvc_alf_filter_chroma_8x32_10_c: 9665.2
vvc_alf_filter_chroma_8x32_10_avx2: 1917.7
vvc_alf_filter_chroma_12x4_10_c: 1865.2
vvc_alf_filter_chroma_12x4_10_avx2: 391.7
vvc_alf_filter_chroma_12x8_10_c: 3625.2
vvc_alf_filter_chroma_12x8_10_avx2: 739.0
vvc_alf_filter_chroma_12x12_10_c: 5427.5
vvc_alf_filter_chroma_12x12_10_avx2: 1094.2
vvc_alf_filter_chroma_12x16_10_c: 7237.7
vvc_alf_filter_chroma_12x16_10_avx2: 1447.2
vvc_alf_filter_chroma_12x20_10_c: 9035.2
vvc_alf_filter_chroma_12x20_10_avx2: 1805.2
vvc_alf_filter_chroma_12x24_10_c: 11135.7
vvc_alf_filter_chroma_12x24_10_avx2: 2158.2
vvc_alf_filter_chroma_12x28_10_c: 12644.0
vvc_alf_filter_chroma_12x28_10_avx2: 2511.2
vvc_alf_filter_chroma_12x32_10_c: 14441.7
vvc_alf_filter_chroma_12x32_10_avx2: 2888.0
vvc_alf_filter_chroma_16x4_10_c: 2410.0
vvc_alf_filter_chroma_16x4_10_avx2: 251.7
vvc_alf_filter_chroma_16x8_10_c: 4943.0
vvc_alf_filter_chroma_16x8_10_avx2: 479.0
vvc_alf_filter_chroma_16x12_10_c: 7235.5
vvc_alf_filter_chroma_16x12_10_avx2: 9751.0
vvc_alf_filter_chroma_16x16_10_c: 10142.7
vvc_alf_filter_chroma_16x16_10_avx2: 935.5
vvc_alf_filter_chroma_16x20_10_c: 12029.0
vvc_alf_filter_chroma_16x20_10_avx2: 1174.5
vvc_alf_filter_chroma_16x24_10_c: 14414.2
vvc_alf_filter_chroma_16x24_10_avx2: 1410.5
vvc_alf_filter_chroma_16x28_10_c: 16813.0
vvc_alf_filter_chroma_16x28_10_avx2: 1713.0
vvc_alf_filter_chroma_16x32_10_c: 19228.5
vvc_alf_filter_chroma_16x32_10_avx2: 2256.0
vvc_alf_filter_chroma_20x4_10_c: 3015.2
vvc_alf_filter_chroma_20x4_10_avx2: 371.7
vvc_alf_filter_chroma_20x8_10_c: 6170.2
vvc_alf_filter_chroma_20x8_10_avx2: 721.0
vvc_alf_filter_chroma_20x12_10_c: 9019.7
vvc_alf_filter_chroma_20x12_10_avx2: 1102.7
vvc_alf_filter_chroma_20x16_10_c: 12040.2
vvc_alf_filter_chroma_20x16_10_avx2: 1422.5
vvc_alf_filter_chroma_20x20_10_c: 15010.7
vvc_alf_filter_chroma_20x20_10_avx2: 1765.7
vvc_alf_filter_chroma_20x24_10_c: 18017.7
vvc_alf_filter_chroma_20x24_10_avx2: 2124.7
vvc_alf_filter_chroma_20x28_10_c: 21025.5
vvc_alf_filter_chroma_20x28_10_avx2: 2488.2
vvc_alf_filter_chroma_20x32_10_c: 31128.5
vvc_alf_filter_chroma_20x32_10_avx2: 3205.2
vvc_alf_filter_chroma_24x4_10_c: 3701.2
vvc_alf_filter_chroma_24x4_10_avx2: 494.7
vvc_alf_filter_chroma_24x8_10_c: 7613.0
vvc_alf_filter_chroma_24x8_10_avx2: 957.2
vvc_alf_filter_chroma_24x12_10_c: 10816.7
vvc_alf_filter_chroma_24x12_10_avx2: 1427.7
vvc_alf_filter_chroma_24x16_10_c: 14390.5
vvc_alf_filter_chroma_24x16_10_avx2: 1948.2
vvc_alf_filter_chroma_24x20_10_c: 17989.5
vvc_alf_filter_chroma_24x20_10_avx2: 2363.7
vvc_alf_filter_chroma_24x24_10_c: 21581.7
vvc_alf_filter_chroma_24x24_10_avx2: 2839.7
vvc_alf_filter_chroma_24x28_10_c: 25179.2
vvc_alf_filter_chroma_24x28_10_avx2: 3313.2
vvc_alf_filter_chroma_24x32_10_c: 28776.2
vvc_alf_filter_chroma_24x32_10_avx2: 4154.7
vvc_alf_filter_chroma_28x4_10_c: 4331.2
vvc_alf_filter_chroma_28x4_10_avx2: 624.2
vvc_alf_filter_chroma_28x8_10_c: 8445.0
vvc_alf_filter_chroma_28x8_10_avx2: 1197.7
vvc_alf_filter_chroma_28x12_10_c: 12684.5
vvc_alf_filter_chroma_28x12_10_avx2: 1786.7
vvc_alf_filter_chroma_28x16_10_c: 16924.5
vvc_alf_filter_chroma_28x16_10_avx2: 2378.7
vvc_alf_filter_chroma_28x20_10_c: 38361.0
vvc_alf_filter_chroma_28x20_10_avx2: 2967.0
vvc_alf_filter_chroma_28x24_10_c: 25329.0
vvc_alf_filter_chroma_28x24_10_avx2: 3564.2
vvc_alf_filter_chroma_28x28_10_c: 29514.0
vvc_alf_filter_chroma_28x28_10_avx2: 4151.7
vvc_alf_filter_chroma_28x32_10_c: 33673.2
vvc_alf_filter_chroma_28x32_10_avx2: 5125.0
vvc_alf_filter_chroma_32x4_10_c: 4945.2
vvc_alf_filter_chroma_32x4_10_avx2: 485.7
vvc_alf_filter_chroma_32x8_10_c: 9658.7
vvc_alf_filter_chroma_32x8_10_avx2: 943.7
vvc_alf_filter_chroma_32x12_10_c: 16177.7
vvc_alf_filter_chroma_32x12_10_avx2: 1443.7

[FFmpeg-devel] [PATCH v3 3/3] avcodec/x86/vvc/vvcdsp_init: fix linking error when configuring with --disable-ssse3 --disable-optimizations options

2024-04-17 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvcdsp_init.c | 46 +---
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index aef6699c35..985d750472 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -88,6 +88,7 @@ AVG_PROTOTYPES(10, avx2)
 AVG_PROTOTYPES(12, avx2)
 
 #if ARCH_X86_64
+#if HAVE_SSE4_EXTERNAL
 #define FW_PUT(name, depth, opt) \
 void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t 
*src, ptrdiff_t srcstride,\
  int height, const int8_t *hf, 
const int8_t *vf, int width)\
@@ -125,7 +126,9 @@ void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t 
*dst, const uint8_t *sr
 FW_PUT_SSE4( 8)
 FW_PUT_SSE4(10)
 FW_PUT_SSE4(12)
+#endif
 
+#if HAVE_AVX2_EXTERNAL
 #define FW_PUT_TAP_AVX2(n, bitd)\
 FW_PUT(n ## tap_h32,   bitd, avx2)  \
 FW_PUT(n ## tap_h64,   bitd, avx2)  \
@@ -161,6 +164,25 @@ FW_PUT_AVX2(12)
 FW_PUT_16BPC_AVX2(10)
 FW_PUT_16BPC_AVX2(12)
 
+#define AVG_FUNCS(bpc, bd, opt)
 \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
 \
+const int16_t *src0, const int16_t *src1, int width, int height)   
 \
+{  
 \
+BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << 
bd)  - 1);   \
+}  
 \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
 \
+const int16_t *src0, const int16_t *src1, int width, int height,   
 \
+int denom, int w0, int w1, int o0, int o1) 
 \
+{  
 \
+BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, 
 \
+denom, w0, w1, o0, o1, (1 << bd)  - 1);
 \
+}
+
+AVG_FUNCS(8,  8,  avx2)
+AVG_FUNCS(16, 10, avx2)
+AVG_FUNCS(16, 12, avx2)
+#endif
+
 #define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt)  
\
 dst[C][W][idx1][idx2] = ff_vvc_put_## name ## _ ## D ## _##opt;
\
 dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## 
_##opt; \
@@ -226,27 +248,9 @@ FW_PUT_16BPC_AVX2(12)
 MC_TAP_LINKS_16BPC_AVX2(LUMA,   8, bd);  \
 MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
 
-#define AVG_FUNCS(bpc, bd, opt)
 \
-void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
 \
-const int16_t *src0, const int16_t *src1, int width, int height)   
 \
-{  
 \
-BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << 
bd)  - 1);   \
-}  
 \
-void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
 \
-const int16_t *src0, const int16_t *src1, int width, int height,   
 \
-int denom, int w0, int w1, int o0, int o1) 
 \
-{  
 \
-BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, 
 \
-denom, w0, w1, o0, o1, (1 << bd)  - 1);
 \
-}
-
-AVG_FUNCS(8,  8,  avx2)
-AVG_FUNCS(16, 10, avx2)
-AVG_FUNCS(16, 12, avx2)
-
-#define AVG_INIT(bd, opt) do {  \
-c->inter.avg= bf(ff_vvc_avg, bd, opt);  \
-c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt);\
+#define AVG_INIT(bd, opt) do {   \
+c->inter.avg= bf(ff_vvc_avg, bd, opt);   \
+c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt); \
 } while (0)
 #endif
 
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3 2/3] avcodec/x86/vvc/vvcdsp_init: add avg prototypes

2024-04-17 Thread toqsxw
From: Wu Jianhua 

When configuring with the --disable-ssse3 --disable-optimizations options,
the compiler does not eliminate the MC_LINKS code the way an optimized
build does, so the build fails because the function prototypes cannot be
found. Hence, this commit adds prototypes for the functions in the same
way as the HEVC DSP does.

Once prototypes are added for the functions, they can no longer carry the
static qualifier. Therefore, the ff_vvc prefix is needed to avoid naming
conflicts.

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvcdsp_init.c | 45 
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index d9203f4d5f..aef6699c35 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -63,6 +63,30 @@ PUT_TAP_PROTOTYPES(8, sse4)
 PUT_TAP_PROTOTYPES(4, avx2)
 PUT_TAP_PROTOTYPES(8, avx2)
 
+#define bf(fn, bd,  opt) fn##_##bd##_##opt
+#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
+
+#define AVG_BPC_PROTOTYPES(bpc, opt)   
  \
+void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,  
  \
+const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
intptr_t pixel_max);  \
+void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,
  \
+const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
  \
+intptr_t denom, intptr_t w0, intptr_t w1,  intptr_t o0, intptr_t o1, 
intptr_t pixel_max);
+
+#define AVG_PROTOTYPES(bd, opt)
  \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
  \
+const int16_t *src0, const int16_t *src1, int width, int height);  
  \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
  \
+const int16_t *src0, const int16_t *src1, int width, int height,   
  \
+int denom, int w0, int w1, int o0, int o1);
+
+AVG_BPC_PROTOTYPES( 8, avx2)
+AVG_BPC_PROTOTYPES(16, avx2)
+
+AVG_PROTOTYPES( 8, avx2)
+AVG_PROTOTYPES(10, avx2)
+AVG_PROTOTYPES(12, avx2)
+
 #if ARCH_X86_64
 #define FW_PUT(name, depth, opt) \
 void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t 
*src, ptrdiff_t srcstride,\
@@ -202,23 +226,13 @@ FW_PUT_16BPC_AVX2(12)
 MC_TAP_LINKS_16BPC_AVX2(LUMA,   8, bd);  \
 MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
 
-#define bf(fn, bd,  opt) fn##_##bd##_##opt
-#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
-
-#define AVG_BPC_FUNC(bpc, opt) 
 \
-void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,  
 \
-const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
intptr_t pixel_max); \
-void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,
 \
-const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
 \
-intptr_t denom, intptr_t w0, intptr_t w1,  intptr_t o0, intptr_t o1, 
intptr_t pixel_max);
-
 #define AVG_FUNCS(bpc, bd, opt)
 \
-static void bf(avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
 \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
 \
 const int16_t *src0, const int16_t *src1, int width, int height)   
 \
 {  
 \
 BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << 
bd)  - 1);   \
 }  
 \
-static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
 \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
 \
 const int16_t *src0, const int16_t *src1, int width, int height,   
 \
 int denom, int w0, int w1, int o0, int o1) 
 \
 {  
 \
@@ -226,16 +240,13 @@ static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride,
 denom, w0, w1, o0, o1, (1 << bd)  - 1);
 \
 }
 
-AVG_BPC_FUNC(8,   avx2)
-AVG_BPC_FUNC(16,  avx2)
-
 AVG_FUNCS(8,  8,  avx2)
 AVG_FUNCS(16, 10, avx2)
 AVG_FUNCS(16, 12, avx2)
 
 #define 

[FFmpeg-devel] [PATCH v3 1/3] avcodec/x86/vvc/vvcdsp_init: add put prototypes

2024-04-17 Thread toqsxw
From: Wu Jianhua 

When configuring with the --disable-ssse3 --disable-optimizations options,
the compiler does not eliminate the MC_LINKS code the way an optimized
build does, so the build fails because the function prototypes cannot be
found. Hence, this commit adds prototypes for the functions in the same
way as the HEVC DSP does.

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvcdsp_init.c | 35 +++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 23a3172c45..d9203f4d5f 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -30,9 +30,42 @@
 #include "libavcodec/vvc/dsp.h"
 #include "libavcodec/x86/h26x/h2656dsp.h"
 
+#define PUT_PROTOTYPE(name, depth, opt) \
+void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t 
*src, ptrdiff_t srcstride, int height, const int8_t *hf, const int8_t *vf, int 
width);
+
+#define PUT_PROTOTYPES(name, bitd, opt) \
+PUT_PROTOTYPE(name##2,   bitd, opt) \
+PUT_PROTOTYPE(name##4,   bitd, opt) \
+PUT_PROTOTYPE(name##8,   bitd, opt) \
+PUT_PROTOTYPE(name##12,  bitd, opt) \
+PUT_PROTOTYPE(name##16,  bitd, opt) \
+PUT_PROTOTYPE(name##24,  bitd, opt) \
+PUT_PROTOTYPE(name##32,  bitd, opt) \
+PUT_PROTOTYPE(name##48,  bitd, opt) \
+PUT_PROTOTYPE(name##64,  bitd, opt) \
+PUT_PROTOTYPE(name##128, bitd, opt)
+
+#define PUT_BPC_PROTOTYPES(name, opt) \
+PUT_PROTOTYPES(name,  8, opt) \
+PUT_PROTOTYPES(name, 10, opt) \
+PUT_PROTOTYPES(name, 12, opt)
+
+#define PUT_TAP_PROTOTYPES(n, opt) \
+PUT_BPC_PROTOTYPES(n##tap_h,  opt) \
+PUT_BPC_PROTOTYPES(n##tap_v,  opt) \
+PUT_BPC_PROTOTYPES(n##tap_hv, opt)
+
+PUT_BPC_PROTOTYPES(pixels, sse4)
+PUT_BPC_PROTOTYPES(pixels, avx2)
+
+PUT_TAP_PROTOTYPES(4, sse4)
+PUT_TAP_PROTOTYPES(8, sse4)
+PUT_TAP_PROTOTYPES(4, avx2)
+PUT_TAP_PROTOTYPES(8, avx2)
+
 #if ARCH_X86_64
 #define FW_PUT(name, depth, opt) \
-static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const 
uint8_t *src, ptrdiff_t srcstride, \
+void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t 
*src, ptrdiff_t srcstride,\
  int height, const int8_t *hf, 
const int8_t *vf, int width)\
 {  
\
 ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, 
srcstride, height, hf, vf, width); \
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 3/3] avcodec/x86/vvc/vvcdsp_init: fix linking error when configuring with --disable-ssse3 --disable-optimizations options

2024-04-15 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvcdsp_init.c | 46 +---
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index aef6699c35..985d750472 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -88,6 +88,7 @@ AVG_PROTOTYPES(10, avx2)
 AVG_PROTOTYPES(12, avx2)
 
 #if ARCH_X86_64
+#if HAVE_SSE4_EXTERNAL
 #define FW_PUT(name, depth, opt) \
 void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t 
*src, ptrdiff_t srcstride,\
  int height, const int8_t *hf, 
const int8_t *vf, int width)\
@@ -125,7 +126,9 @@ void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t 
*dst, const uint8_t *sr
 FW_PUT_SSE4( 8)
 FW_PUT_SSE4(10)
 FW_PUT_SSE4(12)
+#endif
 
+#if HAVE_AVX2_EXTERNAL
 #define FW_PUT_TAP_AVX2(n, bitd)\
 FW_PUT(n ## tap_h32,   bitd, avx2)  \
 FW_PUT(n ## tap_h64,   bitd, avx2)  \
@@ -161,6 +164,25 @@ FW_PUT_AVX2(12)
 FW_PUT_16BPC_AVX2(10)
 FW_PUT_16BPC_AVX2(12)
 
+#define AVG_FUNCS(bpc, bd, opt)
 \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
 \
+const int16_t *src0, const int16_t *src1, int width, int height)   
 \
+{  
 \
+BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << 
bd)  - 1);   \
+}  
 \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
 \
+const int16_t *src0, const int16_t *src1, int width, int height,   
 \
+int denom, int w0, int w1, int o0, int o1) 
 \
+{  
 \
+BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, 
 \
+denom, w0, w1, o0, o1, (1 << bd)  - 1);
 \
+}
+
+AVG_FUNCS(8,  8,  avx2)
+AVG_FUNCS(16, 10, avx2)
+AVG_FUNCS(16, 12, avx2)
+#endif
+
 #define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt)  
\
 dst[C][W][idx1][idx2] = ff_vvc_put_## name ## _ ## D ## _##opt;
\
 dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## 
_##opt; \
@@ -226,27 +248,9 @@ FW_PUT_16BPC_AVX2(12)
 MC_TAP_LINKS_16BPC_AVX2(LUMA,   8, bd);  \
 MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
 
-#define AVG_FUNCS(bpc, bd, opt)
 \
-void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
 \
-const int16_t *src0, const int16_t *src1, int width, int height)   
 \
-{  
 \
-BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << 
bd)  - 1);   \
-}  
 \
-void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
 \
-const int16_t *src0, const int16_t *src1, int width, int height,   
 \
-int denom, int w0, int w1, int o0, int o1) 
 \
-{  
 \
-BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, 
 \
-denom, w0, w1, o0, o1, (1 << bd)  - 1);
 \
-}
-
-AVG_FUNCS(8,  8,  avx2)
-AVG_FUNCS(16, 10, avx2)
-AVG_FUNCS(16, 12, avx2)
-
-#define AVG_INIT(bd, opt) do {  \
-c->inter.avg= bf(ff_vvc_avg, bd, opt);  \
-c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt);\
+#define AVG_INIT(bd, opt) do {   \
+c->inter.avg= bf(ff_vvc_avg, bd, opt);   \
+c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt); \
 } while (0)
 #endif
 
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 2/3] avcodec/x86/vvc/vvcdsp_init: add avg prototypes

2024-04-15 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvcdsp_init.c | 45 
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index d9203f4d5f..aef6699c35 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -63,6 +63,30 @@ PUT_TAP_PROTOTYPES(8, sse4)
 PUT_TAP_PROTOTYPES(4, avx2)
 PUT_TAP_PROTOTYPES(8, avx2)
 
+#define bf(fn, bd,  opt) fn##_##bd##_##opt
+#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
+
+#define AVG_BPC_PROTOTYPES(bpc, opt)   
  \
+void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,  
  \
+const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
intptr_t pixel_max);  \
+void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,
  \
+const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
  \
+intptr_t denom, intptr_t w0, intptr_t w1,  intptr_t o0, intptr_t o1, 
intptr_t pixel_max);
+
+#define AVG_PROTOTYPES(bd, opt)
  \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
  \
+const int16_t *src0, const int16_t *src1, int width, int height);  
  \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
  \
+const int16_t *src0, const int16_t *src1, int width, int height,   
  \
+int denom, int w0, int w1, int o0, int o1);
+
+AVG_BPC_PROTOTYPES( 8, avx2)
+AVG_BPC_PROTOTYPES(16, avx2)
+
+AVG_PROTOTYPES( 8, avx2)
+AVG_PROTOTYPES(10, avx2)
+AVG_PROTOTYPES(12, avx2)
+
 #if ARCH_X86_64
 #define FW_PUT(name, depth, opt) \
 void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t 
*src, ptrdiff_t srcstride,\
@@ -202,23 +226,13 @@ FW_PUT_16BPC_AVX2(12)
 MC_TAP_LINKS_16BPC_AVX2(LUMA,   8, bd);  \
 MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
 
-#define bf(fn, bd,  opt) fn##_##bd##_##opt
-#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
-
-#define AVG_BPC_FUNC(bpc, opt) 
 \
-void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,  
 \
-const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
intptr_t pixel_max); \
-void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,
 \
-const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
 \
-intptr_t denom, intptr_t w0, intptr_t w1,  intptr_t o0, intptr_t o1, 
intptr_t pixel_max);
-
 #define AVG_FUNCS(bpc, bd, opt)
 \
-static void bf(avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
 \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
 \
 const int16_t *src0, const int16_t *src1, int width, int height)   
 \
 {  
 \
 BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << 
bd)  - 1);   \
 }  
 \
-static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
 \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
 \
 const int16_t *src0, const int16_t *src1, int width, int height,   
 \
 int denom, int w0, int w1, int o0, int o1) 
 \
 {  
 \
@@ -226,16 +240,13 @@ static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride,
 denom, w0, w1, o0, o1, (1 << bd)  - 1);
 \
 }
 
-AVG_BPC_FUNC(8,   avx2)
-AVG_BPC_FUNC(16,  avx2)
-
 AVG_FUNCS(8,  8,  avx2)
 AVG_FUNCS(16, 10, avx2)
 AVG_FUNCS(16, 12, avx2)
 
 #define AVG_INIT(bd, opt) do {  \
-c->inter.avg= bf(avg, bd, opt); \
-c->inter.w_avg  = bf(w_avg, bd, opt);   \
+c->inter.avg= bf(ff_vvc_avg, bd, opt);  \
+c->inter.w_avg  = bf(ff_vvc_w_avg, bd, opt);\
 } while (0)
 #endif
 
-- 
2.44.0.windows.1

___

[FFmpeg-devel] [PATCH v2 1/3] avcodec/x86/vvc/vvcdsp_init: add put prototypes

2024-04-15 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvcdsp_init.c | 35 +++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 23a3172c45..d9203f4d5f 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -30,9 +30,42 @@
 #include "libavcodec/vvc/dsp.h"
 #include "libavcodec/x86/h26x/h2656dsp.h"
 
+#define PUT_PROTOTYPE(name, depth, opt) \
+void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t 
*src, ptrdiff_t srcstride, int height, const int8_t *hf, const int8_t *vf, int 
width);
+
+#define PUT_PROTOTYPES(name, bitd, opt) \
+PUT_PROTOTYPE(name##2,   bitd, opt) \
+PUT_PROTOTYPE(name##4,   bitd, opt) \
+PUT_PROTOTYPE(name##8,   bitd, opt) \
+PUT_PROTOTYPE(name##12,  bitd, opt) \
+PUT_PROTOTYPE(name##16,  bitd, opt) \
+PUT_PROTOTYPE(name##24,  bitd, opt) \
+PUT_PROTOTYPE(name##32,  bitd, opt) \
+PUT_PROTOTYPE(name##48,  bitd, opt) \
+PUT_PROTOTYPE(name##64,  bitd, opt) \
+PUT_PROTOTYPE(name##128, bitd, opt)
+
+#define PUT_BPC_PROTOTYPES(name, opt) \
+PUT_PROTOTYPES(name,  8, opt) \
+PUT_PROTOTYPES(name, 10, opt) \
+PUT_PROTOTYPES(name, 12, opt)
+
+#define PUT_TAP_PROTOTYPES(n, opt) \
+PUT_BPC_PROTOTYPES(n##tap_h,  opt) \
+PUT_BPC_PROTOTYPES(n##tap_v,  opt) \
+PUT_BPC_PROTOTYPES(n##tap_hv, opt)
+
+PUT_BPC_PROTOTYPES(pixels, sse4)
+PUT_BPC_PROTOTYPES(pixels, avx2)
+
+PUT_TAP_PROTOTYPES(4, sse4)
+PUT_TAP_PROTOTYPES(8, sse4)
+PUT_TAP_PROTOTYPES(4, avx2)
+PUT_TAP_PROTOTYPES(8, avx2)
+
 #if ARCH_X86_64
 #define FW_PUT(name, depth, opt) \
-static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const 
uint8_t *src, ptrdiff_t srcstride, \
+void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t 
*src, ptrdiff_t srcstride,\
  int height, const int8_t *hf, 
const int8_t *vf, int width)\
 {  
\
 ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, 
srcstride, height, hf, vf, width); \
-- 
2.44.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] avcodec/x86/vvc/vvcdsp_init: fix linking error when configuring with --disable-ssse3 --disable-optimizations options

2024-02-29 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvcdsp_init.c | 115 ++-
 1 file changed, 82 insertions(+), 33 deletions(-)

diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 0d2c683f0f..9ae84bda48 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -31,9 +31,67 @@
 #include "libavcodec/vvc/vvcdsp.h"
 #include "libavcodec/x86/h26x/h2656dsp.h"
 
+#define PUT_PROTOTYPE(name, depth, opt) \
+void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t 
*src, ptrdiff_t srcstride, int height, const int8_t *hf, const int8_t *vf, int 
width);
+
+#define PUT_PROTOTYPES(name, bitd, opt) \
+PUT_PROTOTYPE(name##2,   bitd, opt) \
+PUT_PROTOTYPE(name##4,   bitd, opt) \
+PUT_PROTOTYPE(name##8,   bitd, opt) \
+PUT_PROTOTYPE(name##12,  bitd, opt) \
+PUT_PROTOTYPE(name##16,  bitd, opt) \
+PUT_PROTOTYPE(name##24,  bitd, opt) \
+PUT_PROTOTYPE(name##32,  bitd, opt) \
+PUT_PROTOTYPE(name##48,  bitd, opt) \
+PUT_PROTOTYPE(name##64,  bitd, opt) \
+PUT_PROTOTYPE(name##128, bitd, opt)
+
+#define PUT_BPC_PROTOTYPES(name, opt) \
+PUT_PROTOTYPES(name,  8, opt) \
+PUT_PROTOTYPES(name, 10, opt) \
+PUT_PROTOTYPES(name, 12, opt)
+
+#define PUT_TAP_PROTOTYPES(n, opt) \
+PUT_BPC_PROTOTYPES(n##tap_h,  opt) \
+PUT_BPC_PROTOTYPES(n##tap_v,  opt) \
+PUT_BPC_PROTOTYPES(n##tap_hv, opt)
+
+PUT_BPC_PROTOTYPES(pixels, sse4)
+PUT_BPC_PROTOTYPES(pixels, avx2)
+
+PUT_TAP_PROTOTYPES(4, sse4)
+PUT_TAP_PROTOTYPES(8, sse4)
+PUT_TAP_PROTOTYPES(4, avx2)
+PUT_TAP_PROTOTYPES(8, avx2)
+
+#define bf(fn, bd,  opt) fn##_##bd##_##opt
+#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
+
+#define AVG_BPC_PROTOTYPES(bpc, opt)   
  \
+void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,  
  \
+const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
intptr_t pixel_max);  \
+void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,
  \
+const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, 
  \
+intptr_t denom, intptr_t w0, intptr_t w1,  intptr_t o0, intptr_t o1, 
intptr_t pixel_max);
+
+#define AVG_PROTOTYPES(bd, opt)
  \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
  \
+const int16_t *src0, const int16_t *src1, int width, int height);  
  \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
  \
+const int16_t *src0, const int16_t *src1, int width, int height,   
  \
+int denom, int w0, int w1, int o0, int o1);
+
+AVG_BPC_PROTOTYPES( 8, avx2)
+AVG_BPC_PROTOTYPES(16, avx2)
+
+AVG_PROTOTYPES( 8, avx2)
+AVG_PROTOTYPES(10, avx2)
+AVG_PROTOTYPES(12, avx2)
+
 #if ARCH_X86_64
+#if HAVE_SSE4_EXTERNAL
 #define FW_PUT(name, depth, opt) \
-static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const 
uint8_t *src, ptrdiff_t srcstride, \
+void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t 
*src, ptrdiff_t srcstride,\
  int height, const int8_t *hf, 
const int8_t *vf, int width)\
 {  
\
 ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, 
srcstride, height, hf, vf, width); \
@@ -69,7 +127,9 @@ static void ff_vvc_put_ ## name ## _ ## depth ## 
_##opt(int16_t *dst, const uint
 FW_PUT_SSE4( 8)
 FW_PUT_SSE4(10)
 FW_PUT_SSE4(12)
+#endif
 
+#if HAVE_AVX2_EXTERNAL
 #define FW_PUT_TAP_AVX2(n, bitd)\
 FW_PUT(n ## tap_h32,   bitd, avx2)  \
 FW_PUT(n ## tap_h64,   bitd, avx2)  \
@@ -105,6 +165,25 @@ FW_PUT_AVX2(12)
 FW_PUT_16BPC_AVX2(10)
 FW_PUT_16BPC_AVX2(12)
 
+#define AVG_FUNCS(bpc, bd, opt)
 \
+void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
 \
+const int16_t *src0, const int16_t *src1, int width, int height)   
 \
+{  
 \
+BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << 
bd)  - 1);   \
+}  
 \
+void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
 \
+const int16_t *src0, const int16_t *src1, int width, int height, 

[FFmpeg-devel] [PATCH] avcodec/x86/vvc/vvcdsp_init: fix unresolved external symbol on ARCH_X86_32

2024-02-05 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/vvc/vvcdsp_init.c | 78 
 1 file changed, 40 insertions(+), 38 deletions(-)

diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index 909ef9f56b..8ee4074350 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -31,6 +31,7 @@
 #include "libavcodec/vvc/vvcdsp.h"
 #include "libavcodec/x86/h26x/h2656dsp.h"
 
+#if ARCH_X86_64
 #define FW_PUT(name, depth, opt) \
 static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const 
uint8_t *src, ptrdiff_t srcstride, \
  int height, const int8_t *hf, 
const int8_t *vf, int width)\
@@ -204,51 +205,52 @@ AVG_FUNCS(16, 12, avx2)
 c->inter.avg= bf(avg, bd, opt); \
 c->inter.w_avg  = bf(w_avg, bd, opt);   \
 } while (0)
+#endif
 
 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
 {
+#if ARCH_X86_64
 const int cpu_flags = av_get_cpu_flags();
 
-if (ARCH_X86_64) {
-if (bd == 8) {
-if (EXTERNAL_SSE4(cpu_flags)) {
-MC_LINK_SSE4(8);
-}
-if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-MC_LINKS_AVX2(8);
-}
-} else if (bd == 10) {
-if (EXTERNAL_SSE4(cpu_flags)) {
-MC_LINK_SSE4(10);
-}
-if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-MC_LINKS_AVX2(10);
-MC_LINKS_16BPC_AVX2(10);
-}
-} else if (bd == 12) {
-if (EXTERNAL_SSE4(cpu_flags)) {
-MC_LINK_SSE4(12);
-}
-if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-MC_LINKS_AVX2(12);
-MC_LINKS_16BPC_AVX2(12);
-}
+if (bd == 8) {
+if (EXTERNAL_SSE4(cpu_flags)) {
+MC_LINK_SSE4(8);
 }
+if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+MC_LINKS_AVX2(8);
+}
+} else if (bd == 10) {
+if (EXTERNAL_SSE4(cpu_flags)) {
+MC_LINK_SSE4(10);
+}
+if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+MC_LINKS_AVX2(10);
+MC_LINKS_16BPC_AVX2(10);
+}
+} else if (bd == 12) {
+if (EXTERNAL_SSE4(cpu_flags)) {
+MC_LINK_SSE4(12);
+}
+if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+MC_LINKS_AVX2(12);
+MC_LINKS_16BPC_AVX2(12);
+}
+}
 
-if (EXTERNAL_AVX2(cpu_flags)) {
-switch (bd) {
-case 8:
-AVG_INIT(8, avx2);
-break;
-case 10:
-AVG_INIT(10, avx2);
-break;
-case 12:
-AVG_INIT(12, avx2);
-break;
-default:
-break;
-}
+if (EXTERNAL_AVX2(cpu_flags)) {
+switch (bd) {
+case 8:
+AVG_INIT(8, avx2);
+break;
+case 10:
+AVG_INIT(10, avx2);
+break;
+case 12:
+AVG_INIT(12, avx2);
+break;
+default:
+break;
 }
 }
+#endif
 }
-- 
2.43.0.windows.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v4 6/8] tests/checkasm: add checkasm_check_vvc_mc

2024-01-23 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/Makefile   |   1 +
 tests/checkasm/checkasm.c |   3 +
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vvc_mc.c   | 270 ++
 4 files changed, 275 insertions(+)
 create mode 100644 tests/checkasm/vvc_mc.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3b5b54352b..3562acb2b2 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_mc.o
 
 CHECKASMOBJS-$(CONFIG_AVCODEC)  += $(AVCODECOBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 87f24c77ca..36a97957e5 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -194,6 +194,9 @@ static const struct {
 #if CONFIG_VORBIS_DECODER
 { "vorbisdsp", checkasm_check_vorbisdsp },
 #endif
+#if CONFIG_VVC_DECODER
+{ "vvc_mc", checkasm_check_vvc_mc },
+#endif
 #endif
 #if CONFIG_AVFILTER
 #if CONFIG_AFIR_FILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 4db8c495ea..53cb3ccfbf 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
 void checkasm_check_vorbisdsp(void);
+void checkasm_check_vvc_mc(void);
 
 struct CheckasmPerf;
 
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
new file mode 100644
index 0000000000..711280deec
--- /dev/null
+++ b/tests/checkasm/vvc_mc.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2023-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvc_data.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 
0x3fff3fff, 0xffffffff };
+static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
+
+#define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
+#define EXTRA_BEFORE 3
+#define EXTRA_AFTER  4
+#define SRC_EXTRA    (EXTRA_BEFORE + EXTRA_AFTER) * 2
+#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA)
+#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2)
+#define SRC_OFFSET   ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE)
+
+#define randomize_buffers(buf0, buf1, size, mask)   \
+do {\
+int k;  \
+for (k = 0; k < size; k += 4) { \
+uint32_t r = rnd() & mask;  \
+AV_WN32A(buf0 + k, r);  \
+AV_WN32A(buf1 + k, r);  \
+}   \
+} while (0)
+
+#define randomize_pixels(buf0, buf1, size)  \
+do {\
+uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+randomize_buffers(buf0, buf1, size, mask);  \
+} while (0)
+
+#define randomize_avg_src(buf0, buf1, size) \
+do {\
+uint32_t mask = 0x3fff3fff; \
+randomize_buffers(buf0, buf1, size, mask);  \
+} while (0)
+
+static void check_put_vvc_luma(void)
+{
+LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]);
+LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]);
+LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+VVCDSPContext c;
+
+declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t 
*dst, const uint8_t *src, const ptrdiff_t src_stride,
+  

[FFmpeg-devel] [PATCH v4 8/8] tests/checkasm/vvc_mc: add check_avg

2024-01-23 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/vvc_mc.c | 64 +
 1 file changed, 64 insertions(+)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 711280deec..8adb00573f 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -35,6 +35,7 @@
 static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 
0x3fff3fff, 0xffffffff };
 static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
 
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
 #define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
 #define EXTRA_BEFORE 3
 #define EXTRA_AFTER  4
@@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void)
 report("put_uni_chroma");
 }
 
+#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE)
+#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2)
+
+static void check_avg(void)
+{
+LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]);
+VVCDSPContext c;
+
+for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * 
sizeof(int16_t));
+randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * 
sizeof(int16_t));
+ff_vvc_dsp_init(&c, bit_depth);
+for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) {
+for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) {
+{
+   declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, 
void, uint8_t *dst, ptrdiff_t dst_stride,
+const int16_t *src0, const int16_t *src1, int width, 
int height);
+if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, 
h)) {
+memset(dst0, 0, AVG_DST_BUF_SIZE);
+memset(dst1, 0, AVG_DST_BUF_SIZE);
+call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, 
src01, w, h);
+call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, 
src11, w, h);
+if (memcmp(dst0, dst1, DST_BUF_SIZE))
+fail();
+if (w == h)
+bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, 
src00, src01, w, h);
+}
+}
+{
+declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, 
void, uint8_t *dst, ptrdiff_t dst_stride,
+const int16_t *src0, const int16_t *src1, int width, 
int height,
+int denom, int w0, int w1, int o0, int o1);
+{
+const int denom = rnd() % 8;
+const int w0= rnd() % 256 - 128;
+const int w1= rnd() % 256 - 128;
+const int o0= rnd() % 256 - 128;
+const int o1= rnd() % 256 - 128;
+if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", 
bit_depth, w, h)) {
+memset(dst0, 0, AVG_DST_BUF_SIZE);
+memset(dst1, 0, AVG_DST_BUF_SIZE);
+
+call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, 
src01, w, h, denom, w0, w1, o0, o1);
+call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, 
src11, w, h, denom, w0, w1, o0, o1);
+if (memcmp(dst0, dst1, DST_BUF_SIZE))
+fail();
+if (w == h)
+bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, 
src00, src01, w, h, denom, w0, w1, o0, o1);
+}
+}
+}
+}
+}
+}
+report("avg");
+}
+
 void checkasm_check_vvc_mc(void)
 {
 check_put_vvc_luma();
 check_put_vvc_luma_uni();
 check_put_vvc_chroma();
 check_put_vvc_chroma_uni();
+check_avg();
 }
-- 
2.34.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v4 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations

2024-01-23 Thread toqsxw
From: Wu Jianhua 

The avg/avg_w implementations are based on dav1d.
See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm

vvc_avg_8_2x2_c: 71.6
vvc_avg_8_2x2_avx2: 26.8
vvc_avg_8_2x4_c: 140.8
vvc_avg_8_2x4_avx2: 34.6
vvc_avg_8_2x8_c: 410.3
vvc_avg_8_2x8_avx2: 41.3
vvc_avg_8_2x16_c: 769.3
vvc_avg_8_2x16_avx2: 60.3
vvc_avg_8_2x32_c: 1669.6
vvc_avg_8_2x32_avx2: 105.1
vvc_avg_8_2x64_c: 1978.3
vvc_avg_8_2x64_avx2: 425.8
vvc_avg_8_2x128_c: 6536.8
vvc_avg_8_2x128_avx2: 1315.1
vvc_avg_8_4x2_c: 155.6
vvc_avg_8_4x2_avx2: 26.1
vvc_avg_8_4x4_c: 250.3
vvc_avg_8_4x4_avx2: 31.3
vvc_avg_8_4x8_c: 831.8
vvc_avg_8_4x8_avx2: 41.3
vvc_avg_8_4x16_c: 1461.1
vvc_avg_8_4x16_avx2: 57.1
vvc_avg_8_4x32_c: 2821.6
vvc_avg_8_4x32_avx2: 105.1
vvc_avg_8_4x64_c: 3615.8
vvc_avg_8_4x64_avx2: 412.6
vvc_avg_8_4x128_c: 11962.6
vvc_avg_8_4x128_avx2: 1274.3
vvc_avg_8_8x2_c: 215.8
vvc_avg_8_8x2_avx2: 29.1
vvc_avg_8_8x4_c: 430.6
vvc_avg_8_8x4_avx2: 37.6
vvc_avg_8_8x8_c: 1463.3
vvc_avg_8_8x8_avx2: 51.8
vvc_avg_8_8x16_c: 2630.1
vvc_avg_8_8x16_avx2: 97.6
vvc_avg_8_8x32_c: 5813.8
vvc_avg_8_8x32_avx2: 196.6
vvc_avg_8_8x64_c: 6687.3
vvc_avg_8_8x64_avx2: 487.8
vvc_avg_8_8x128_c: 13178.6
vvc_avg_8_8x128_avx2: 1290.6
vvc_avg_8_16x2_c: 443.8
vvc_avg_8_16x2_avx2: 28.3
vvc_avg_8_16x4_c: 1253.3
vvc_avg_8_16x4_avx2: 32.1
vvc_avg_8_16x8_c: 2236.3
vvc_avg_8_16x8_avx2: 44.3
vvc_avg_8_16x16_c: 5127.8
vvc_avg_8_16x16_avx2: 63.3
vvc_avg_8_16x32_c: 6573.3
vvc_avg_8_16x32_avx2: 223.6
vvc_avg_8_16x64_c: 30311.8
vvc_avg_8_16x64_avx2: 437.8
vvc_avg_8_16x128_c: 25693.3
vvc_avg_8_16x128_avx2: 1266.8
vvc_avg_8_32x2_c: 954.6
vvc_avg_8_32x2_avx2: 32.1
vvc_avg_8_32x4_c: 2359.6
vvc_avg_8_32x4_avx2: 39.6
vvc_avg_8_32x8_c: 5703.6
vvc_avg_8_32x8_avx2: 57.1
vvc_avg_8_32x16_c: 9967.6
vvc_avg_8_32x16_avx2: 107.1
vvc_avg_8_32x32_c: 21327.6
vvc_avg_8_32x32_avx2: 272.6
vvc_avg_8_32x64_c: 39240.8
vvc_avg_8_32x64_avx2: 529.6
vvc_avg_8_32x128_c: 52580.8
vvc_avg_8_32x128_avx2: 1338.8
vvc_avg_8_64x2_c: 1647.3
vvc_avg_8_64x2_avx2: 38.8
vvc_avg_8_64x4_c: 5130.1
vvc_avg_8_64x4_avx2: 58.8
vvc_avg_8_64x8_c: 6529.3
vvc_avg_8_64x8_avx2: 88.3
vvc_avg_8_64x16_c: 19913.6
vvc_avg_8_64x16_avx2: 162.3
vvc_avg_8_64x32_c: 39360.8
vvc_avg_8_64x32_avx2: 295.8
vvc_avg_8_64x64_c: 49658.3
vvc_avg_8_64x64_avx2: 784.1
vvc_avg_8_64x128_c: 108513.1
vvc_avg_8_64x128_avx2: 1977.1
vvc_avg_8_128x2_c: 3226.1
vvc_avg_8_128x2_avx2: 61.1
vvc_avg_8_128x4_c: 10280.3
vvc_avg_8_128x4_avx2: 94.6
vvc_avg_8_128x8_c: 18079.3
vvc_avg_8_128x8_avx2: 155.3
vvc_avg_8_128x16_c: 45121.8
vvc_avg_8_128x16_avx2: 285.3
vvc_avg_8_128x32_c: 48651.8
vvc_avg_8_128x32_avx2: 581.6
vvc_avg_8_128x64_c: 165078.6
vvc_avg_8_128x64_avx2: 1942.8
vvc_avg_8_128x128_c: 339103.1
vvc_avg_8_128x128_avx2: 4332.6
vvc_avg_10_2x2_c: 144.3
vvc_avg_10_2x2_avx2: 26.8
vvc_avg_10_2x4_c: 142.6
vvc_avg_10_2x4_avx2: 45.3
vvc_avg_10_2x8_c: 478.1
vvc_avg_10_2x8_avx2: 38.1
vvc_avg_10_2x16_c: 518.3
vvc_avg_10_2x16_avx2: 58.1
vvc_avg_10_2x32_c: 2059.8
vvc_avg_10_2x32_avx2: 93.1
vvc_avg_10_2x64_c: 2383.8
vvc_avg_10_2x64_avx2: 714.8
vvc_avg_10_2x128_c: 4498.3
vvc_avg_10_2x128_avx2: 1466.3
vvc_avg_10_4x2_c: 228.6
vvc_avg_10_4x2_avx2: 26.8
vvc_avg_10_4x4_c: 378.3
vvc_avg_10_4x4_avx2: 30.6
vvc_avg_10_4x8_c: 866.8
vvc_avg_10_4x8_avx2: 44.6
vvc_avg_10_4x16_c: 1018.1
vvc_avg_10_4x16_avx2: 58.1
vvc_avg_10_4x32_c: 3590.8
vvc_avg_10_4x32_avx2: 128.8
vvc_avg_10_4x64_c: 4200.8
vvc_avg_10_4x64_avx2: 663.6
vvc_avg_10_4x128_c: 8450.8
vvc_avg_10_4x128_avx2: 1531.8
vvc_avg_10_8x2_c: 369.3
vvc_avg_10_8x2_avx2: 28.3
vvc_avg_10_8x4_c: 513.8
vvc_avg_10_8x4_avx2: 32.1
vvc_avg_10_8x8_c: 1720.3
vvc_avg_10_8x8_avx2: 49.1
vvc_avg_10_8x16_c: 1894.8
vvc_avg_10_8x16_avx2: 71.6
vvc_avg_10_8x32_c: 3931.3
vvc_avg_10_8x32_avx2: 148.1
vvc_avg_10_8x64_c: 7964.3
vvc_avg_10_8x64_avx2: 613.1
vvc_avg_10_8x128_c: 15540.1
vvc_avg_10_8x128_avx2: 1585.1
vvc_avg_10_16x2_c: 877.3
vvc_avg_10_16x2_avx2: 27.6
vvc_avg_10_16x4_c: 955.8
vvc_avg_10_16x4_avx2: 29.8
vvc_avg_10_16x8_c: 3419.6
vvc_avg_10_16x8_avx2: 62.6
vvc_avg_10_16x16_c: 3826.8
vvc_avg_10_16x16_avx2: 54.3
vvc_avg_10_16x32_c: 7655.3
vvc_avg_10_16x32_avx2: 86.3
vvc_avg_10_16x64_c: 30011.1
vvc_avg_10_16x64_avx2: 692.6
vvc_avg_10_16x128_c: 47894.8
vvc_avg_10_16x128_avx2: 1580.3
vvc_avg_10_32x2_c: 944.3
vvc_avg_10_32x2_avx2: 29.8
vvc_avg_10_32x4_c: 2022.6
vvc_avg_10_32x4_avx2: 35.1
vvc_avg_10_32x8_c: 6148.8
vvc_avg_10_32x8_avx2: 51.3
vvc_avg_10_32x16_c: 12601.6
vvc_avg_10_32x16_avx2: 70.8
vvc_avg_10_32x32_c: 15958.6
vvc_avg_10_32x32_avx2: 124.3
vvc_avg_10_32x64_c: 31784.6
vvc_avg_10_32x64_avx2: 757.3
vvc_avg_10_32x128_c: 63892.8
vvc_avg_10_32x128_avx2: 1711.3
vvc_avg_10_64x2_c: 1890.8
vvc_avg_10_64x2_avx2: 34.3
vvc_avg_10_64x4_c: 6267.3
vvc_avg_10_64x4_avx2: 42.6
vvc_avg_10_64x8_c: 12778.1
vvc_avg_10_64x8_avx2: 67.8
vvc_avg_10_64x16_c: 22304.3
vvc_avg_10_64x16_avx2: 116.8
vvc_avg_10_64x32_c: 30777.1
vvc_avg_10_64x32_avx2: 201.1
vvc_avg_10_64x64_c: 60169.1
vvc_avg_10_64x64_avx2: 1454.3
vvc_avg_10_64x128_c: 124392.8
vvc_avg_10_64x128_avx2: 3648.6

[FFmpeg-devel] [PATCH v4 5/8] avcodec/vvcdec: reuse h26x/2656_inter.asm to enable x86 optimizations

2024-01-23 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/Makefile  |   1 +
 libavcodec/vvc/vvcdsp.c  |   4 +
 libavcodec/vvc/vvcdsp.h  |   2 +
 libavcodec/x86/vvc/Makefile  |   6 +
 libavcodec/x86/vvc/vvcdsp_init.c | 202 +++
 5 files changed, 215 insertions(+)
 create mode 100644 libavcodec/x86/vvc/Makefile
 create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index bb42095165..ce33631b60 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -65,6 +65,7 @@ OBJS = ac3_parser.o   
  \
 
 # subsystems
 include $(SRC_PATH)/libavcodec/vvc/Makefile
+include $(SRC_PATH)/libavcodec/x86/vvc/Makefile
 OBJS-$(CONFIG_AANDCTTABLES)+= aandcttab.o
 OBJS-$(CONFIG_AC3DSP)  += ac3dsp.o ac3.o ac3tab.o
 OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o 
mpeg4audio_sample_rates.o
diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c
index c82ea7be30..c542be5258 100644
--- a/libavcodec/vvc/vvcdsp.c
+++ b/libavcodec/vvc/vvcdsp.c
@@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
 VVC_DSP(8);
 break;
 }
+
+#if ARCH_X86
+ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
+#endif
 }
diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h
index b5a63c5833..6f59e73654 100644
--- a/libavcodec/vvc/vvcdsp.h
+++ b/libavcodec/vvc/vvcdsp.h
@@ -167,4 +167,6 @@ typedef struct VVCDSPContext {
 
 void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
 
+void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
+
 #endif /* AVCODEC_VVC_VVCDSP_H */
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
new file mode 100644
index 0000000000..b4acc22501
--- /dev/null
+++ b/libavcodec/x86/vvc/Makefile
@@ -0,0 +1,6 @@
+clean::
+   $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%)
+
+OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o
+X86ASM-OBJS-$(CONFIG_VVC_DECODER)  += x86/h26x/h2656dsp.o   \
+   
  x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..c197cdb4cc
--- /dev/null
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -0,0 +1,202 @@
+/*
+ * VVC DSP init for x86
+ *
+ * Copyright (C) 2022-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vvc/vvcdec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvcdsp.h"
+#include "libavcodec/x86/h26x/h2656dsp.h"
+
+#define FW_PUT(name, depth, opt) \
+static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const 
uint8_t *src, ptrdiff_t srcstride, \
+ int height, const int8_t *hf, 
const int8_t *vf, int width)\
+{  
\
+ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, 
srcstride, height, hf, vf, width); \
+}
+
+#define FW_PUT_TAP(fname, bitd, opt ) \
+FW_PUT(fname##4,   bitd, opt );   \
+FW_PUT(fname##8,   bitd, opt );   \
+FW_PUT(fname##16,  bitd, opt );   \
+FW_PUT(fname##32,  bitd, opt );   \
+FW_PUT(fname##64,  bitd, opt );   \
+FW_PUT(fname##128, bitd, opt );   \
+
+#define FW_PUT_4TAP(fname, bitd, opt) \
+FW_PUT(fname ## 2, bitd, opt) \
+FW_PUT_TAP(fname,  bitd, opt)
+
+#define FW_PUT_4TAP_SSE4(bitd)   \
+FW_PUT_4TAP(pixels,  bitd, sse4) \
+FW_PUT_4TAP(4tap_h,  bitd, sse4) \
+FW_PUT_4TAP(4tap_v,  bitd, sse4) \
+FW_PUT_4TAP(4tap_hv, bitd, sse4)
+
+#define FW_PUT_8TAP_SSE4(bitd)  \
+FW_PUT_TAP(8tap_h,  bitd, sse4) \
+FW_PUT_TAP(8tap_v,  bitd, sse4) \
+FW_PUT_TAP(8tap_hv, bitd, sse4)
+
+#define FW_PUT_SSE4(bitd)  \
+FW_PUT_4TAP_SSE4(bitd) \
+FW_PUT_8TAP_SSE4(bitd)
+
+FW_PUT_SSE4( 8);
+FW_PUT_SSE4(10);
+FW_PUT_SSE4(12);
+
+#define 

[FFmpeg-devel] [PATCH v4 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm

2024-01-23 Thread toqsxw
From: Wu Jianhua 

This enables the asm optimizations to be reused by VVC.

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/Makefile |1 +
 libavcodec/x86/h26x/h2656_inter.asm | 1145 +++
 libavcodec/x86/h26x/h2656dsp.c  |   98 +++
 libavcodec/x86/h26x/h2656dsp.h  |  103 +++
 libavcodec/x86/hevc_mc.asm  |  462 +--
 libavcodec/x86/hevcdsp_init.c   |  108 ++-
 6 files changed, 1471 insertions(+), 446 deletions(-)
 create mode 100644 libavcodec/x86/h26x/h2656_inter.asm
 create mode 100644 libavcodec/x86/h26x/h2656dsp.c
 create mode 100644 libavcodec/x86/h26x/h2656dsp.h

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index d5fb30645a..8098cd840c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += 
x86/hevc_add_res.o\
   x86/hevc_deblock.o\
   x86/hevc_idct.o   \
   x86/hevc_mc.o \
+  x86/h26x/h2656_inter.o\
   x86/hevc_sao.o\
   x86/hevc_sao_10bit.o
 X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
new file mode 100644
index 0000000000..aa296d549c
--- /dev/null
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -0,0 +1,1145 @@
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; * Copyright (c) 2023-2024 Nuo Mi
+; * Copyright (c) 2023-2024 Wu Jianhua
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 64
+
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+cextern pw_8192
+%define scale_8 pw_512
+%define scale_10 pw_2048
+%define scale_12 pw_8192
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+max_pixels_12:  times 16 dw ((1 << 12)-1)
+cextern pb_0
+
+SECTION .text
+%macro SIMPLE_LOAD 4;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+movd  %4, [%3]   ; 
load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+movq  %4, [%3]   ; 
load data from source
+%elif notcpuflag(avx)
+movu  %4, [%3]   ; 
load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+movdqu   %4, [%3]
+%else
+movu  %4, [%3]
+%endif
+%endmacro
+
+%macro VPBROADCASTW 2
+%if notcpuflag(avx2)
+movd   %1, %2
+pshuflw%1, %1, 0
+punpcklwd  %1, %1
+%else
+vpbroadcastw   %1, %2
+%endif
+%endmacro
+
+%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b,
+VPBROADCASTW   %3, [%2q + 0 * 2]  ; coeff 0, 1
+VPBROADCASTW   %4, [%2q + 1 * 2]  ; coeff 2, 3
+%if %1 != 8
+pmovsxbw   %3, xmm%3
+pmovsxbw   %4, xmm%4
+%endif
+%endmacro
+
+%macro MC_4TAP_HV_FILTER 1
+VPBROADCASTW  m12, [vfq + 0 * 2]  ; vf 0, 1
+VPBROADCASTW  m13, [vfq + 1 * 2]  ; vf 2, 3
+VPBROADCASTW  m14, [hfq + 0 * 2]  ; hf 0, 1
+VPBROADCASTW  m15, [hfq + 1 * 2]  ; hf 2, 3
+
+pmovsxbw  m12, xm12
+pmovsxbw  m13, xm13
+%if %1 != 8
+pmovsxbw  m14, xm14
+pmovsxbw  m15, xm15
+%endif
+lea   r3srcq, [srcstrideq*3]
+%endmacro
+
+%macro MC_8TAP_SAVE_FILTER 5;offset, mm registers
+mova [rsp + %1 + 0*mmsize], %2
+mova [rsp + %1 + 1*mmsize], %3
+mova [rsp + %1 + 2*mmsize], %4
+mova [rsp + %1 + 3*mmsize], %5
+%endmacro
+
+%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset
+VPBROADCASTW  m12, [%2q + 0 * 2]  ; coeff 0, 1
+VPBROADCASTW  m13, [%2q + 1 * 2]  ; coeff 2, 3
+VPBROADCASTW  m14, [%2q + 2 * 2]  ; coeff 4, 5
+VPBROADCASTW  m15, [%2q + 3 * 2] 

[FFmpeg-devel] [PATCH v4 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template

2024-01-23 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/hevcdsp_template.c | 594 +++---
 1 file changed, 46 insertions(+), 548 deletions(-)

diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 0de14e9dcf..9b48bdf08e 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -26,6 +26,7 @@
 #include "bit_depth_template.c"
 #include "hevcdsp.h"
 #include "h26x/h2656_sao_template.c"
+#include "h26x/h2656_inter_template.c"
 
 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int 
height,
   GetBitContext *gb, int pcm_bit_depth)
@@ -299,37 +300,51 @@ IDCT_DC(32)
 

 //
 

-static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
-  const uint8_t *_src, ptrdiff_t 
_srcstride,
-  int height, intptr_t mx, intptr_t my, 
int width)
-{
-int x, y;
-const pixel *src= (const pixel *)_src;
-ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-
-for (y = 0; y < height; y++) {
-for (x = 0; x < width; x++)
-dst[x] = src[x] << (14 - BIT_DEPTH);
-src += srcstride;
-dst += MAX_PB_SIZE;
-}
-}
-
-static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, 
const uint8_t *_src, ptrdiff_t _srcstride,
-  int height, intptr_t mx, intptr_t 
my, int width)
-{
-int y;
-const pixel *src= (const pixel *)_src;
-ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-pixel *dst  = (pixel *)_dst;
-ptrdiff_t dststride = _dststride / sizeof(pixel);
-
-for (y = 0; y < height; y++) {
-memcpy(dst, src, width * sizeof(pixel));
-src += srcstride;
-dst += dststride;
-}
-}
+#define ff_hevc_pel_filters ff_hevc_qpel_filters
+#define DECL_HV_FILTER(f)  \
+const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \
+const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1];
+
+#define FW_PUT(p, f, t)
   \
+static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t 
srcstride, int height,\
+  intptr_t mx, intptr_t my, int width) 
   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width);   
   \
+}
+
+#define FW_PUT_UNI(p, f, t)
   \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const 
uint8_t *src,   \
+  ptrdiff_t srcstride, int height, intptr_t 
mx, intptr_t my, int width)   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width);
   \
+}
+
+#define FW_PUT_UNI_W(p, f, t)  
   \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const 
uint8_t *src,   \
+  ptrdiff_t srcstride,int height, int denom, 
int wx, int ox,  \
+  intptr_t mx, intptr_t my, int width) 
   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, 
vf, width);\
+}
+
+#define FW_PUT_FUNCS(f, t, dir)   \
+FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \
+FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir)\
+FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir)
+
+FW_PUT(pel, pel_pixels, pixels)
+FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels)
+FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels)
+
+FW_PUT_FUNCS(qpel, luma,   h )
+FW_PUT_FUNCS(qpel, luma,   v )
+FW_PUT_FUNCS(qpel, luma,   hv)
+FW_PUT_FUNCS(epel, chroma, h )
+FW_PUT_FUNCS(epel, chroma, v )
+FW_PUT_FUNCS(epel, chroma, hv)
 
 static void FUNC(put_hevc_pel_bi_pixels)(uint8_t 

[FFmpeg-devel] [PATCH v4 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put

2024-01-23 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/h26x/h2656_inter.asm | 32 ++---
 libavcodec/x86/h26x/h2656dsp.c  |  4 ++--
 libavcodec/x86/h26x/h2656dsp.h  |  2 +-
 libavcodec/x86/hevcdsp_init.c   |  2 +-
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
index aa296d549c..cbba0c1ea5 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -22,8 +22,6 @@
 ; */
 %include "libavutil/x86/x86util.asm"
 
-%define MAX_PB_SIZE 64
-
 SECTION_RODATA 32
 cextern pw_255
 cextern pw_512
@@ -342,7 +340,7 @@ SECTION .text
 %endmacro
 
 %macro LOOP_END 3
-add  %1q, 2*MAX_PB_SIZE  ; dst += dststride
+add  %1q, dststrideq ; dst += dststride
 add  %2q, %3q; src += srcstride
 dec  heightd ; cmp height
 jnz   .loop  ; height loop
@@ -539,7 +537,7 @@ SECTION .text
 
 
 ; **
-; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ; int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
 
@@ -549,7 +547,7 @@ SECTION .text
 %endmacro
 
 %macro MC_PIXELS 3
-cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height
+cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height
 pxor  m2, m2
 .loop:
 SIMPLE_LOAD   %2, %3, srcq, m0
@@ -579,10 +577,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, 
src, srcstride, height
 %endif
 
 ; **
-; void %1_put_4tap_hX(int16_t *dst,
+; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width);
 ; **
-cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf
+cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, 
height, hf
 %assign %%stride ((%3 + 7)/8)
 MC_4TAP_FILTER   %3, hf, m4, m5
 .loop:
@@ -612,10 +610,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, 
dststride, src, srcstride,
 RET
 
 ; **
-; void %1_put_4tap_v(int16_t *dst,
+; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width)
 ; **
-cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, 
r3src, vf
+cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, 
height, r3src, vf
 sub srcq, srcstrideq
 MC_4TAP_FILTER%3, vf, m4, m5
 lea   r3srcq, [srcstrideq*3]
@@ -649,10 +647,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, 
dststride, src, srcstride,
 
 %macro PUT_4TAP_HV 3
 ; **
-; void put_4tap_hv(int16_t *dst,
+; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width)
 ; **
-cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, 
r3src
+cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, 
height, hf, vf, r3src
 %assign %%stride ((%3 + 7)/8)
 sub srcq, srcstrideq
 MC_4TAP_HV_FILTER%3
@@ -784,12 +782,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, 
dststride, src, srcstride, heig
 %endmacro
 
 ; **
-; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ;   int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
 
 %macro PUT_8TAP 3
-cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf
+cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, 
hf
 MC_8TAP_FILTER  %3, hf
 .loop:
 MC_8TAP_H_LOAD  %3, srcq, %2, 10
@@ -824,10 +822,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, 
dststride, src, srcstride, heigh
 
 
 ; **
-; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ;  int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
-cglobal %1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf
+cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height, 

[FFmpeg-devel] [PATCH v4 1/8] avcodec/vvc/vvc_inter_template: move put/put_luma/put_chroma template to h2656_inter_template.c

2024-01-23 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/h26x/h2656_inter_template.c | 577 +
 libavcodec/vvc/vvc_inter_template.c| 559 +---
 2 files changed, 578 insertions(+), 558 deletions(-)
 create mode 100644 libavcodec/h26x/h2656_inter_template.c

diff --git a/libavcodec/h26x/h2656_inter_template.c 
b/libavcodec/h26x/h2656_inter_template.c
new file mode 100644
index 0000000000..864f6c7e7d
--- /dev/null
+++ b/libavcodec/h26x/h2656_inter_template.c
@@ -0,0 +1,577 @@
+/*
+ * inter prediction template for HEVC/VVC
+ *
+ * Copyright (C) 2022 Nuo Mi
+ * Copyright (C) 2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define CHROMA_EXTRA_BEFORE 1
+#define CHROMA_EXTRA 3
+#define LUMA_EXTRA_BEFORE   3
+#define LUMA_EXTRA  7
+
+static void FUNC(put_pixels)(int16_t *dst,
+const uint8_t *_src, const ptrdiff_t _src_stride,
+const int height, const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src= (const pixel *)_src;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++)
+dst[x] = src[x] << (14 - BIT_DEPTH);
+src += src_stride;
+dst += MAX_PB_SIZE;
+}
+}
+
+static void FUNC(put_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+const uint8_t *_src, const ptrdiff_t _src_stride, const int height,
+ const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src= (const pixel *)_src;
+pixel *dst  = (pixel *)_dst;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
+
+for (int y = 0; y < height; y++) {
+memcpy(dst, src, width * sizeof(pixel));
+src += src_stride;
+dst += dst_stride;
+}
+}
+
+static void FUNC(put_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+const uint8_t *_src, const ptrdiff_t _src_stride, const int height,
+const int denom, const int wx, const int _ox,  const int8_t *hf, const 
int8_t *vf,
+const int width)
+{
+const pixel *src= (const pixel *)_src;
+pixel *dst  = (pixel *)_dst;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
+const int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+const int offset= 1 << (shift - 1);
+#else
+const int offset= 0;
+#endif
+const int ox= _ox * (1 << (BIT_DEPTH - 8));
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++) {
+const int v = (src[x] << (14 - BIT_DEPTH));
+dst[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox);
+}
+src += src_stride;
+dst += dst_stride;
+}
+}
+
+#define LUMA_FILTER(src, stride)   
\
+(filter[0] * src[x - 3 * stride] + 
\
+ filter[1] * src[x - 2 * stride] + 
\
+ filter[2] * src[x - stride] + 
\
+ filter[3] * src[x ] + 
\
+ filter[4] * src[x + stride] + 
\
+ filter[5] * src[x + 2 * stride] + 
\
+ filter[6] * src[x + 3 * stride] + 
\
+ filter[7] * src[x + 4 * stride])
+
+static void FUNC(put_luma_h)(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
+const int height, const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src   = (const pixel*)_src;
+const ptrdiff_t src_stride = _src_stride / sizeof(pixel);
+const int8_t *filter   = hf;
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++)
+dst[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8);
+src += src_stride;
+dst += MAX_PB_SIZE;
+}
+}
+
+static void 

[FFmpeg-devel] [PATCH v3 8/8] tests/checkasm/vvc_mc: add check_avg

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/vvc_mc.c | 64 +
 1 file changed, 64 insertions(+)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 711280deec..8adb00573f 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -35,6 +35,7 @@
 static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 
0x3fff3fff, 0xffffffff };
 static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
 
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
 #define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
 #define EXTRA_BEFORE 3
 #define EXTRA_AFTER  4
@@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void)
 report("put_uni_chroma");
 }
 
+#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE)
+#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2)
+
+static void check_avg(void)
+{
+LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]);
+VVCDSPContext c;
+
+for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * 
sizeof(int16_t));
+randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * 
sizeof(int16_t));
+ff_vvc_dsp_init(&c, bit_depth);
+for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) {
+for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) {
+{
+   declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, 
void, uint8_t *dst, ptrdiff_t dst_stride,
+const int16_t *src0, const int16_t *src1, int width, 
int height);
+if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, 
h)) {
+memset(dst0, 0, AVG_DST_BUF_SIZE);
+memset(dst1, 0, AVG_DST_BUF_SIZE);
+call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, 
src01, w, h);
+call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, 
src11, w, h);
+if (memcmp(dst0, dst1, DST_BUF_SIZE))
+fail();
+if (w == h)
+bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, 
src00, src01, w, h);
+}
+}
+{
+declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, 
void, uint8_t *dst, ptrdiff_t dst_stride,
+const int16_t *src0, const int16_t *src1, int width, 
int height,
+int denom, int w0, int w1, int o0, int o1);
+{
+const int denom = rnd() % 8;
+const int w0= rnd() % 256 - 128;
+const int w1= rnd() % 256 - 128;
+const int o0= rnd() % 256 - 128;
+const int o1= rnd() % 256 - 128;
+if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", 
bit_depth, w, h)) {
+memset(dst0, 0, AVG_DST_BUF_SIZE);
+memset(dst1, 0, AVG_DST_BUF_SIZE);
+
+call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, 
src01, w, h, denom, w0, w1, o0, o1);
+call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, 
src11, w, h, denom, w0, w1, o0, o1);
+if (memcmp(dst0, dst1, DST_BUF_SIZE))
+fail();
+if (w == h)
+bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, 
src00, src01, w, h, denom, w0, w1, o0, o1);
+}
+}
+}
+}
+}
+}
+report("avg");
+}
+
 void checkasm_check_vvc_mc(void)
 {
 check_put_vvc_luma();
 check_put_vvc_luma_uni();
 check_put_vvc_chroma();
 check_put_vvc_chroma_uni();
+check_avg();
 }
-- 
2.34.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3 5/8] avcodec/vvcdec: reuse h26x/h2656_inter.asm to enable x86 optimizations

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/Makefile  |   1 +
 libavcodec/vvc/vvcdsp.c  |   4 +
 libavcodec/vvc/vvcdsp.h  |   2 +
 libavcodec/x86/vvc/Makefile  |   6 +
 libavcodec/x86/vvc/vvcdsp_init.c | 202 +++
 5 files changed, 215 insertions(+)
 create mode 100644 libavcodec/x86/vvc/Makefile
 create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index bb42095165..ce33631b60 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -65,6 +65,7 @@ OBJS = ac3_parser.o   
  \
 
 # subsystems
 include $(SRC_PATH)/libavcodec/vvc/Makefile
+include $(SRC_PATH)/libavcodec/x86/vvc/Makefile
 OBJS-$(CONFIG_AANDCTTABLES)+= aandcttab.o
 OBJS-$(CONFIG_AC3DSP)  += ac3dsp.o ac3.o ac3tab.o
 OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o 
mpeg4audio_sample_rates.o
diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c
index c82ea7be30..c542be5258 100644
--- a/libavcodec/vvc/vvcdsp.c
+++ b/libavcodec/vvc/vvcdsp.c
@@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
 VVC_DSP(8);
 break;
 }
+
+#if ARCH_X86
+ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
+#endif
 }
diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h
index b5a63c5833..6f59e73654 100644
--- a/libavcodec/vvc/vvcdsp.h
+++ b/libavcodec/vvc/vvcdsp.h
@@ -167,4 +167,6 @@ typedef struct VVCDSPContext {
 
 void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
 
+void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
+
 #endif /* AVCODEC_VVC_VVCDSP_H */
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
new file mode 100644
index 0000000000..b4acc22501
--- /dev/null
+++ b/libavcodec/x86/vvc/Makefile
@@ -0,0 +1,6 @@
+clean::
+   $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%)
+
+OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o
+X86ASM-OBJS-$(CONFIG_VVC_DECODER)  += x86/h26x/h2656dsp.o   \
+   
  x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..c197cdb4cc
--- /dev/null
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -0,0 +1,202 @@
+/*
+ * VVC DSP init for x86
+ *
+ * Copyright (C) 2022-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vvc/vvcdec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvcdsp.h"
+#include "libavcodec/x86/h26x/h2656dsp.h"
+
+#define FW_PUT(name, depth, opt) \
+static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const 
uint8_t *src, ptrdiff_t srcstride, \
+ int height, const int8_t *hf, 
const int8_t *vf, int width)\
+{  
\
+ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, 
srcstride, height, hf, vf, width); \
+}
+
+#define FW_PUT_TAP(fname, bitd, opt ) \
+FW_PUT(fname##4,   bitd, opt );   \
+FW_PUT(fname##8,   bitd, opt );   \
+FW_PUT(fname##16,  bitd, opt );   \
+FW_PUT(fname##32,  bitd, opt );   \
+FW_PUT(fname##64,  bitd, opt );   \
+FW_PUT(fname##128, bitd, opt );   \
+
+#define FW_PUT_4TAP(fname, bitd, opt) \
+FW_PUT(fname ## 2, bitd, opt) \
+FW_PUT_TAP(fname,  bitd, opt)
+
+#define FW_PUT_4TAP_SSE4(bitd)   \
+FW_PUT_4TAP(pixels,  bitd, sse4) \
+FW_PUT_4TAP(4tap_h,  bitd, sse4) \
+FW_PUT_4TAP(4tap_v,  bitd, sse4) \
+FW_PUT_4TAP(4tap_hv, bitd, sse4)
+
+#define FW_PUT_8TAP_SSE4(bitd)  \
+FW_PUT_TAP(8tap_h,  bitd, sse4) \
+FW_PUT_TAP(8tap_v,  bitd, sse4) \
+FW_PUT_TAP(8tap_hv, bitd, sse4)
+
+#define FW_PUT_SSE4(bitd)  \
+FW_PUT_4TAP_SSE4(bitd) \
+FW_PUT_8TAP_SSE4(bitd)
+
+FW_PUT_SSE4( 8);
+FW_PUT_SSE4(10);
+FW_PUT_SSE4(12);
+
+#define 

[FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations

2024-01-22 Thread toqsxw
From: Wu Jianhua 

The avg/avg_w is based on dav1d.
See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm

vvc_avg_8_2x2_c: 71.6
vvc_avg_8_2x2_avx2: 26.8
vvc_avg_8_2x4_c: 140.8
vvc_avg_8_2x4_avx2: 34.6
vvc_avg_8_2x8_c: 410.3
vvc_avg_8_2x8_avx2: 41.3
vvc_avg_8_2x16_c: 769.3
vvc_avg_8_2x16_avx2: 60.3
vvc_avg_8_2x32_c: 1669.6
vvc_avg_8_2x32_avx2: 105.1
vvc_avg_8_2x64_c: 1978.3
vvc_avg_8_2x64_avx2: 425.8
vvc_avg_8_2x128_c: 6536.8
vvc_avg_8_2x128_avx2: 1315.1
vvc_avg_8_4x2_c: 155.6
vvc_avg_8_4x2_avx2: 26.1
vvc_avg_8_4x4_c: 250.3
vvc_avg_8_4x4_avx2: 31.3
vvc_avg_8_4x8_c: 831.8
vvc_avg_8_4x8_avx2: 41.3
vvc_avg_8_4x16_c: 1461.1
vvc_avg_8_4x16_avx2: 57.1
vvc_avg_8_4x32_c: 2821.6
vvc_avg_8_4x32_avx2: 105.1
vvc_avg_8_4x64_c: 3615.8
vvc_avg_8_4x64_avx2: 412.6
vvc_avg_8_4x128_c: 11962.6
vvc_avg_8_4x128_avx2: 1274.3
vvc_avg_8_8x2_c: 215.8
vvc_avg_8_8x2_avx2: 29.1
vvc_avg_8_8x4_c: 430.6
vvc_avg_8_8x4_avx2: 37.6
vvc_avg_8_8x8_c: 1463.3
vvc_avg_8_8x8_avx2: 51.8
vvc_avg_8_8x16_c: 2630.1
vvc_avg_8_8x16_avx2: 97.6
vvc_avg_8_8x32_c: 5813.8
vvc_avg_8_8x32_avx2: 196.6
vvc_avg_8_8x64_c: 6687.3
vvc_avg_8_8x64_avx2: 487.8
vvc_avg_8_8x128_c: 13178.6
vvc_avg_8_8x128_avx2: 1290.6
vvc_avg_8_16x2_c: 443.8
vvc_avg_8_16x2_avx2: 28.3
vvc_avg_8_16x4_c: 1253.3
vvc_avg_8_16x4_avx2: 32.1
vvc_avg_8_16x8_c: 2236.3
vvc_avg_8_16x8_avx2: 44.3
vvc_avg_8_16x16_c: 5127.8
vvc_avg_8_16x16_avx2: 63.3
vvc_avg_8_16x32_c: 6573.3
vvc_avg_8_16x32_avx2: 223.6
vvc_avg_8_16x64_c: 30311.8
vvc_avg_8_16x64_avx2: 437.8
vvc_avg_8_16x128_c: 25693.3
vvc_avg_8_16x128_avx2: 1266.8
vvc_avg_8_32x2_c: 954.6
vvc_avg_8_32x2_avx2: 32.1
vvc_avg_8_32x4_c: 2359.6
vvc_avg_8_32x4_avx2: 39.6
vvc_avg_8_32x8_c: 5703.6
vvc_avg_8_32x8_avx2: 57.1
vvc_avg_8_32x16_c: 9967.6
vvc_avg_8_32x16_avx2: 107.1
vvc_avg_8_32x32_c: 21327.6
vvc_avg_8_32x32_avx2: 272.6
vvc_avg_8_32x64_c: 39240.8
vvc_avg_8_32x64_avx2: 529.6
vvc_avg_8_32x128_c: 52580.8
vvc_avg_8_32x128_avx2: 1338.8
vvc_avg_8_64x2_c: 1647.3
vvc_avg_8_64x2_avx2: 38.8
vvc_avg_8_64x4_c: 5130.1
vvc_avg_8_64x4_avx2: 58.8
vvc_avg_8_64x8_c: 6529.3
vvc_avg_8_64x8_avx2: 88.3
vvc_avg_8_64x16_c: 19913.6
vvc_avg_8_64x16_avx2: 162.3
vvc_avg_8_64x32_c: 39360.8
vvc_avg_8_64x32_avx2: 295.8
vvc_avg_8_64x64_c: 49658.3
vvc_avg_8_64x64_avx2: 784.1
vvc_avg_8_64x128_c: 108513.1
vvc_avg_8_64x128_avx2: 1977.1
vvc_avg_8_128x2_c: 3226.1
vvc_avg_8_128x2_avx2: 61.1
vvc_avg_8_128x4_c: 10280.3
vvc_avg_8_128x4_avx2: 94.6
vvc_avg_8_128x8_c: 18079.3
vvc_avg_8_128x8_avx2: 155.3
vvc_avg_8_128x16_c: 45121.8
vvc_avg_8_128x16_avx2: 285.3
vvc_avg_8_128x32_c: 48651.8
vvc_avg_8_128x32_avx2: 581.6
vvc_avg_8_128x64_c: 165078.6
vvc_avg_8_128x64_avx2: 1942.8
vvc_avg_8_128x128_c: 339103.1
vvc_avg_8_128x128_avx2: 4332.6
vvc_avg_10_2x2_c: 144.3
vvc_avg_10_2x2_avx2: 26.8
vvc_avg_10_2x4_c: 142.6
vvc_avg_10_2x4_avx2: 45.3
vvc_avg_10_2x8_c: 478.1
vvc_avg_10_2x8_avx2: 38.1
vvc_avg_10_2x16_c: 518.3
vvc_avg_10_2x16_avx2: 58.1
vvc_avg_10_2x32_c: 2059.8
vvc_avg_10_2x32_avx2: 93.1
vvc_avg_10_2x64_c: 2383.8
vvc_avg_10_2x64_avx2: 714.8
vvc_avg_10_2x128_c: 4498.3
vvc_avg_10_2x128_avx2: 1466.3
vvc_avg_10_4x2_c: 228.6
vvc_avg_10_4x2_avx2: 26.8
vvc_avg_10_4x4_c: 378.3
vvc_avg_10_4x4_avx2: 30.6
vvc_avg_10_4x8_c: 866.8
vvc_avg_10_4x8_avx2: 44.6
vvc_avg_10_4x16_c: 1018.1
vvc_avg_10_4x16_avx2: 58.1
vvc_avg_10_4x32_c: 3590.8
vvc_avg_10_4x32_avx2: 128.8
vvc_avg_10_4x64_c: 4200.8
vvc_avg_10_4x64_avx2: 663.6
vvc_avg_10_4x128_c: 8450.8
vvc_avg_10_4x128_avx2: 1531.8
vvc_avg_10_8x2_c: 369.3
vvc_avg_10_8x2_avx2: 28.3
vvc_avg_10_8x4_c: 513.8
vvc_avg_10_8x4_avx2: 32.1
vvc_avg_10_8x8_c: 1720.3
vvc_avg_10_8x8_avx2: 49.1
vvc_avg_10_8x16_c: 1894.8
vvc_avg_10_8x16_avx2: 71.6
vvc_avg_10_8x32_c: 3931.3
vvc_avg_10_8x32_avx2: 148.1
vvc_avg_10_8x64_c: 7964.3
vvc_avg_10_8x64_avx2: 613.1
vvc_avg_10_8x128_c: 15540.1
vvc_avg_10_8x128_avx2: 1585.1
vvc_avg_10_16x2_c: 877.3
vvc_avg_10_16x2_avx2: 27.6
vvc_avg_10_16x4_c: 955.8
vvc_avg_10_16x4_avx2: 29.8
vvc_avg_10_16x8_c: 3419.6
vvc_avg_10_16x8_avx2: 62.6
vvc_avg_10_16x16_c: 3826.8
vvc_avg_10_16x16_avx2: 54.3
vvc_avg_10_16x32_c: 7655.3
vvc_avg_10_16x32_avx2: 86.3
vvc_avg_10_16x64_c: 30011.1
vvc_avg_10_16x64_avx2: 692.6
vvc_avg_10_16x128_c: 47894.8
vvc_avg_10_16x128_avx2: 1580.3
vvc_avg_10_32x2_c: 944.3
vvc_avg_10_32x2_avx2: 29.8
vvc_avg_10_32x4_c: 2022.6
vvc_avg_10_32x4_avx2: 35.1
vvc_avg_10_32x8_c: 6148.8
vvc_avg_10_32x8_avx2: 51.3
vvc_avg_10_32x16_c: 12601.6
vvc_avg_10_32x16_avx2: 70.8
vvc_avg_10_32x32_c: 15958.6
vvc_avg_10_32x32_avx2: 124.3
vvc_avg_10_32x64_c: 31784.6
vvc_avg_10_32x64_avx2: 757.3
vvc_avg_10_32x128_c: 63892.8
vvc_avg_10_32x128_avx2: 1711.3
vvc_avg_10_64x2_c: 1890.8
vvc_avg_10_64x2_avx2: 34.3
vvc_avg_10_64x4_c: 6267.3
vvc_avg_10_64x4_avx2: 42.6
vvc_avg_10_64x8_c: 12778.1
vvc_avg_10_64x8_avx2: 67.8
vvc_avg_10_64x16_c: 22304.3
vvc_avg_10_64x16_avx2: 116.8
vvc_avg_10_64x32_c: 30777.1
vvc_avg_10_64x32_avx2: 201.1
vvc_avg_10_64x64_c: 60169.1
vvc_avg_10_64x64_avx2: 1454.3
vvc_avg_10_64x128_c: 124392.8
vvc_avg_10_64x128_avx2: 3648.6

[FFmpeg-devel] [PATCH v3 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/h26x/h2656_inter.asm | 32 ++---
 libavcodec/x86/h26x/h2656dsp.c  |  4 ++--
 libavcodec/x86/h26x/h2656dsp.h  |  2 +-
 libavcodec/x86/hevcdsp_init.c   |  2 +-
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
index aa296d549c..cbba0c1ea5 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -22,8 +22,6 @@
 ; */
 %include "libavutil/x86/x86util.asm"
 
-%define MAX_PB_SIZE 64
-
 SECTION_RODATA 32
 cextern pw_255
 cextern pw_512
@@ -342,7 +340,7 @@ SECTION .text
 %endmacro
 
 %macro LOOP_END 3
-add  %1q, 2*MAX_PB_SIZE  ; dst += dststride
+add  %1q, dststrideq ; dst += dststride
 add  %2q, %3q; src += srcstride
 dec  heightd ; cmp height
 jnz   .loop  ; height loop
@@ -539,7 +537,7 @@ SECTION .text
 
 
 ; **
-; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ; int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
 
@@ -549,7 +547,7 @@ SECTION .text
 %endmacro
 
 %macro MC_PIXELS 3
-cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height
+cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height
 pxor  m2, m2
 .loop:
 SIMPLE_LOAD   %2, %3, srcq, m0
@@ -579,10 +577,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, 
src, srcstride, height
 %endif
 
 ; **
-; void %1_put_4tap_hX(int16_t *dst,
+; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width);
 ; **
-cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf
+cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, 
height, hf
 %assign %%stride ((%3 + 7)/8)
 MC_4TAP_FILTER   %3, hf, m4, m5
 .loop:
@@ -612,10 +610,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, 
dststride, src, srcstride,
 RET
 
 ; **
-; void %1_put_4tap_v(int16_t *dst,
+; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width)
 ; **
-cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, 
r3src, vf
+cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, 
height, r3src, vf
 sub srcq, srcstrideq
 MC_4TAP_FILTER%3, vf, m4, m5
 lea   r3srcq, [srcstrideq*3]
@@ -649,10 +647,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, 
dststride, src, srcstride,
 
 %macro PUT_4TAP_HV 3
 ; **
-; void put_4tap_hv(int16_t *dst,
+; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width)
 ; **
-cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, 
r3src
+cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, 
height, hf, vf, r3src
 %assign %%stride ((%3 + 7)/8)
 sub srcq, srcstrideq
 MC_4TAP_HV_FILTER%3
@@ -784,12 +782,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, 
dststride, src, srcstride, heig
 %endmacro
 
 ; **
-; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ;   int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
 
 %macro PUT_8TAP 3
-cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf
+cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, 
hf
 MC_8TAP_FILTER  %3, hf
 .loop:
 MC_8TAP_H_LOAD  %3, srcq, %2, 10
@@ -824,10 +822,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, 
dststride, src, srcstride, heigh
 
 
 ; **
-; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ;  int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
-cglobal %1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf
+cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height, 

[FFmpeg-devel] [PATCH v3 6/8] tests/checkasm: add checkasm_check_vvc_mc

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/Makefile   |   1 +
 tests/checkasm/checkasm.c |   3 +
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vvc_mc.c   | 270 ++
 4 files changed, 275 insertions(+)
 create mode 100644 tests/checkasm/vvc_mc.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3b5b54352b..3562acb2b2 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_mc.o
 
 CHECKASMOBJS-$(CONFIG_AVCODEC)  += $(AVCODECOBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 87f24c77ca..36a97957e5 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -194,6 +194,9 @@ static const struct {
 #if CONFIG_VORBIS_DECODER
 { "vorbisdsp", checkasm_check_vorbisdsp },
 #endif
+#if CONFIG_VVC_DECODER
+{ "vvc_mc", checkasm_check_vvc_mc },
+#endif
 #endif
 #if CONFIG_AVFILTER
 #if CONFIG_AFIR_FILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 4db8c495ea..53cb3ccfbf 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
 void checkasm_check_vorbisdsp(void);
+void checkasm_check_vvc_mc(void);
 
 struct CheckasmPerf;
 
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
new file mode 100644
index 00..711280deec
--- /dev/null
+++ b/tests/checkasm/vvc_mc.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2023-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvc_data.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 
0x3fff3fff, 0xffffffff };
+static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
+
+#define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
+#define EXTRA_BEFORE 3
+#define EXTRA_AFTER  4
+#define SRC_EXTRA    (EXTRA_BEFORE + EXTRA_AFTER) * 2
+#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA)
+#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2)
+#define SRC_OFFSET   ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE)
+
+#define randomize_buffers(buf0, buf1, size, mask)   \
+do {\
+int k;  \
+for (k = 0; k < size; k += 4) { \
+uint32_t r = rnd() & mask;  \
+AV_WN32A(buf0 + k, r);  \
+AV_WN32A(buf1 + k, r);  \
+}   \
+} while (0)
+
+#define randomize_pixels(buf0, buf1, size)  \
+do {\
+uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+randomize_buffers(buf0, buf1, size, mask);  \
+} while (0)
+
+#define randomize_avg_src(buf0, buf1, size) \
+do {\
+uint32_t mask = 0x3fff3fff; \
+randomize_buffers(buf0, buf1, size, mask);  \
+} while (0)
+
+static void check_put_vvc_luma(void)
+{
+LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]);
+LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]);
+LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+VVCDSPContext c;
+
+declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t 
*dst, const uint8_t *src, const ptrdiff_t src_stride,
+  

[FFmpeg-devel] [PATCH v3 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/hevcdsp_template.c | 594 +++---
 1 file changed, 46 insertions(+), 548 deletions(-)

diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 0de14e9dcf..9b48bdf08e 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -26,6 +26,7 @@
 #include "bit_depth_template.c"
 #include "hevcdsp.h"
 #include "h26x/h2656_sao_template.c"
+#include "h26x/h2656_inter_template.c"
 
 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int 
height,
   GetBitContext *gb, int pcm_bit_depth)
@@ -299,37 +300,51 @@ IDCT_DC(32)
 

 //
 

-static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
-  const uint8_t *_src, ptrdiff_t 
_srcstride,
-  int height, intptr_t mx, intptr_t my, 
int width)
-{
-int x, y;
-const pixel *src= (const pixel *)_src;
-ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-
-for (y = 0; y < height; y++) {
-for (x = 0; x < width; x++)
-dst[x] = src[x] << (14 - BIT_DEPTH);
-src += srcstride;
-dst += MAX_PB_SIZE;
-}
-}
-
-static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, 
const uint8_t *_src, ptrdiff_t _srcstride,
-  int height, intptr_t mx, intptr_t 
my, int width)
-{
-int y;
-const pixel *src= (const pixel *)_src;
-ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-pixel *dst  = (pixel *)_dst;
-ptrdiff_t dststride = _dststride / sizeof(pixel);
-
-for (y = 0; y < height; y++) {
-memcpy(dst, src, width * sizeof(pixel));
-src += srcstride;
-dst += dststride;
-}
-}
+#define ff_hevc_pel_filters ff_hevc_qpel_filters
+#define DECL_HV_FILTER(f)  \
+const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \
+const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1];
+
+#define FW_PUT(p, f, t)
   \
+static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t 
srcstride, int height,\
+  intptr_t mx, intptr_t my, int width) 
   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width);   
   \
+}
+
+#define FW_PUT_UNI(p, f, t)
   \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const 
uint8_t *src,   \
+  ptrdiff_t srcstride, int height, intptr_t 
mx, intptr_t my, int width)   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width);
   \
+}
+
+#define FW_PUT_UNI_W(p, f, t)  
   \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const 
uint8_t *src,   \
+  ptrdiff_t srcstride,int height, int denom, 
int wx, int ox,  \
+  intptr_t mx, intptr_t my, int width) 
   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, 
vf, width);\
+}
+
+#define FW_PUT_FUNCS(f, t, dir)   \
+FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \
+FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir)\
+FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir)
+
+FW_PUT(pel, pel_pixels, pixels)
+FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels)
+FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels)
+
+FW_PUT_FUNCS(qpel, luma,   h )
+FW_PUT_FUNCS(qpel, luma,   v )
+FW_PUT_FUNCS(qpel, luma,   hv)
+FW_PUT_FUNCS(epel, chroma, h )
+FW_PUT_FUNCS(epel, chroma, v )
+FW_PUT_FUNCS(epel, chroma, hv)
 
 static void FUNC(put_hevc_pel_bi_pixels)(uint8_t 

[FFmpeg-devel] [PATCH v3 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm

2024-01-22 Thread toqsxw
From: Wu Jianhua 

This enables the asm optimizations to be reused by VVC.

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/Makefile |1 +
 libavcodec/x86/h26x/h2656_inter.asm | 1145 +++
 libavcodec/x86/h26x/h2656dsp.c  |   98 +++
 libavcodec/x86/h26x/h2656dsp.h  |  103 +++
 libavcodec/x86/hevc_mc.asm  |  462 +--
 libavcodec/x86/hevcdsp_init.c   |  108 ++-
 6 files changed, 1471 insertions(+), 446 deletions(-)
 create mode 100644 libavcodec/x86/h26x/h2656_inter.asm
 create mode 100644 libavcodec/x86/h26x/h2656dsp.c
 create mode 100644 libavcodec/x86/h26x/h2656dsp.h

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index d5fb30645a..8098cd840c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += 
x86/hevc_add_res.o\
   x86/hevc_deblock.o\
   x86/hevc_idct.o   \
   x86/hevc_mc.o \
+  x86/h26x/h2656_inter.o\
   x86/hevc_sao.o\
   x86/hevc_sao_10bit.o
 X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
new file mode 100644
index 00..aa296d549c
--- /dev/null
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -0,0 +1,1145 @@
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; * Copyright (c) 2023-2024 Nuo Mi
+; * Copyright (c) 2023-2024 Wu Jianhua
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 64
+
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+cextern pw_8192
+%define scale_8 pw_512
+%define scale_10 pw_2048
+%define scale_12 pw_8192
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+max_pixels_12:  times 16 dw ((1 << 12)-1)
+cextern pb_0
+
+SECTION .text
+%macro SIMPLE_LOAD 4;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+movd  %4, [%3]   ; 
load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+movq  %4, [%3]   ; 
load data from source
+%elif notcpuflag(avx)
+movu  %4, [%3]   ; 
load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+movdqu   %4, [%3]
+%else
+movu  %4, [%3]
+%endif
+%endmacro
+
+%macro VPBROADCASTW 2
+%if notcpuflag(avx2)
+movd   %1, %2
+pshuflw%1, %1, 0
+punpcklwd  %1, %1
+%else
+vpbroadcastw   %1, %2
+%endif
+%endmacro
+
+%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b,
+VPBROADCASTW   %3, [%2q + 0 * 2]  ; coeff 0, 1
+VPBROADCASTW   %4, [%2q + 1 * 2]  ; coeff 2, 3
+%if %1 != 8
+pmovsxbw   %3, xmm%3
+pmovsxbw   %4, xmm%4
+%endif
+%endmacro
+
+%macro MC_4TAP_HV_FILTER 1
+VPBROADCASTW  m12, [vfq + 0 * 2]  ; vf 0, 1
+VPBROADCASTW  m13, [vfq + 1 * 2]  ; vf 2, 3
+VPBROADCASTW  m14, [hfq + 0 * 2]  ; hf 0, 1
+VPBROADCASTW  m15, [hfq + 1 * 2]  ; hf 2, 3
+
+pmovsxbw  m12, xm12
+pmovsxbw  m13, xm13
+%if %1 != 8
+pmovsxbw  m14, xm14
+pmovsxbw  m15, xm15
+%endif
+lea   r3srcq, [srcstrideq*3]
+%endmacro
+
+%macro MC_8TAP_SAVE_FILTER 5;offset, mm registers
+mova [rsp + %1 + 0*mmsize], %2
+mova [rsp + %1 + 1*mmsize], %3
+mova [rsp + %1 + 2*mmsize], %4
+mova [rsp + %1 + 3*mmsize], %5
+%endmacro
+
+%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset
+VPBROADCASTW  m12, [%2q + 0 * 2]  ; coeff 0, 1
+VPBROADCASTW  m13, [%2q + 1 * 2]  ; coeff 2, 3
+VPBROADCASTW  m14, [%2q + 2 * 2]  ; coeff 4, 5
+VPBROADCASTW  m15, [%2q + 3 * 2] 

[FFmpeg-devel] [PATCH v3 1/8] avcodec/vvc/vvc_inter_template: move put/put_luma/put_chroma template to h2656_inter_template.c

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/h26x/h2656_inter_template.c | 577 +
 libavcodec/vvc/vvc_inter_template.c| 559 +---
 2 files changed, 578 insertions(+), 558 deletions(-)
 create mode 100644 libavcodec/h26x/h2656_inter_template.c

diff --git a/libavcodec/h26x/h2656_inter_template.c 
b/libavcodec/h26x/h2656_inter_template.c
new file mode 100644
index 00..864f6c7e7d
--- /dev/null
+++ b/libavcodec/h26x/h2656_inter_template.c
@@ -0,0 +1,577 @@
+/*
+ * inter prediction template for HEVC/VVC
+ *
+ * Copyright (C) 2022 Nuo Mi
+ * Copyright (C) 2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define CHROMA_EXTRA_BEFORE 1
+#define CHROMA_EXTRA        3
+#define LUMA_EXTRA_BEFORE   3
+#define LUMA_EXTRA  7
+
+static void FUNC(put_pixels)(int16_t *dst,
+const uint8_t *_src, const ptrdiff_t _src_stride,
+const int height, const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src= (const pixel *)_src;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++)
+dst[x] = src[x] << (14 - BIT_DEPTH);
+src += src_stride;
+dst += MAX_PB_SIZE;
+}
+}
+
+static void FUNC(put_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+const uint8_t *_src, const ptrdiff_t _src_stride, const int height,
+ const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src= (const pixel *)_src;
+pixel *dst  = (pixel *)_dst;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
+
+for (int y = 0; y < height; y++) {
+memcpy(dst, src, width * sizeof(pixel));
+src += src_stride;
+dst += dst_stride;
+}
+}
+
+static void FUNC(put_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+const uint8_t *_src, const ptrdiff_t _src_stride, const int height,
+const int denom, const int wx, const int _ox,  const int8_t *hf, const 
int8_t *vf,
+const int width)
+{
+const pixel *src= (const pixel *)_src;
+pixel *dst  = (pixel *)_dst;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
+const int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+const int offset= 1 << (shift - 1);
+#else
+const int offset= 0;
+#endif
+const int ox= _ox * (1 << (BIT_DEPTH - 8));
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++) {
+const int v = (src[x] << (14 - BIT_DEPTH));
+dst[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox);
+}
+src += src_stride;
+dst += dst_stride;
+}
+}
+
+#define LUMA_FILTER(src, stride)   
\
+(filter[0] * src[x - 3 * stride] + 
\
+ filter[1] * src[x - 2 * stride] + 
\
+ filter[2] * src[x - stride] + 
\
+ filter[3] * src[x ] + 
\
+ filter[4] * src[x + stride] + 
\
+ filter[5] * src[x + 2 * stride] + 
\
+ filter[6] * src[x + 3 * stride] + 
\
+ filter[7] * src[x + 4 * stride])
+
+static void FUNC(put_luma_h)(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
+const int height, const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src   = (const pixel*)_src;
+const ptrdiff_t src_stride = _src_stride / sizeof(pixel);
+const int8_t *filter   = hf;
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++)
+dst[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8);
+src += src_stride;
+dst += MAX_PB_SIZE;
+}
+}
+
+static void 

[FFmpeg-devel] [PATCH v3 6/8] tests/checkasm: add checkasm_check_vvc_mc

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/Makefile   |   1 +
 tests/checkasm/checkasm.c |   3 +
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vvc_mc.c   | 270 ++
 4 files changed, 275 insertions(+)
 create mode 100644 tests/checkasm/vvc_mc.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3b5b54352b..3562acb2b2 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_mc.o
 
 CHECKASMOBJS-$(CONFIG_AVCODEC)  += $(AVCODECOBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 87f24c77ca..36a97957e5 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -194,6 +194,9 @@ static const struct {
 #if CONFIG_VORBIS_DECODER
 { "vorbisdsp", checkasm_check_vorbisdsp },
 #endif
+#if CONFIG_VVC_DECODER
+{ "vvc_mc", checkasm_check_vvc_mc },
+#endif
 #endif
 #if CONFIG_AVFILTER
 #if CONFIG_AFIR_FILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 4db8c495ea..53cb3ccfbf 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
 void checkasm_check_vorbisdsp(void);
+void checkasm_check_vvc_mc(void);
 
 struct CheckasmPerf;
 
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
new file mode 100644
index 00..711280deec
--- /dev/null
+++ b/tests/checkasm/vvc_mc.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2023-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvc_data.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 
0x3fff3fff, 0xffffffff };
+static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
+
+#define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
+#define EXTRA_BEFORE 3
+#define EXTRA_AFTER  4
+#define SRC_EXTRA    (EXTRA_BEFORE + EXTRA_AFTER) * 2
+#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA)
+#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2)
+#define SRC_OFFSET   ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE)
+
+#define randomize_buffers(buf0, buf1, size, mask)   \
+do {\
+int k;  \
+for (k = 0; k < size; k += 4) { \
+uint32_t r = rnd() & mask;  \
+AV_WN32A(buf0 + k, r);  \
+AV_WN32A(buf1 + k, r);  \
+}   \
+} while (0)
+
+#define randomize_pixels(buf0, buf1, size)  \
+do {\
+uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+randomize_buffers(buf0, buf1, size, mask);  \
+} while (0)
+
+#define randomize_avg_src(buf0, buf1, size) \
+do {\
+uint32_t mask = 0x3fff3fff; \
+randomize_buffers(buf0, buf1, size, mask);  \
+} while (0)
+
+static void check_put_vvc_luma(void)
+{
+LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]);
+LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]);
+LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+VVCDSPContext c;
+
+declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t 
*dst, const uint8_t *src, const ptrdiff_t src_stride,
+  

[FFmpeg-devel] [PATCH v3 8/8] tests/checkasm/vvc_mc: add check_avg

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/vvc_mc.c | 64 +
 1 file changed, 64 insertions(+)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 711280deec..8adb00573f 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -35,6 +35,7 @@
 static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 
0x3fff3fff, 0xffffffff };
 static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
 
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
 #define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
 #define EXTRA_BEFORE 3
 #define EXTRA_AFTER  4
@@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void)
 report("put_uni_chroma");
 }
 
+#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE)
+#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2)
+
+static void check_avg(void)
+{
+LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]);
+VVCDSPContext c;
+
+for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * 
sizeof(int16_t));
+randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * 
sizeof(int16_t));
+ff_vvc_dsp_init(&c, bit_depth);
+for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) {
+for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) {
+{
+   declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, 
void, uint8_t *dst, ptrdiff_t dst_stride,
+const int16_t *src0, const int16_t *src1, int width, 
int height);
+if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, 
h)) {
+memset(dst0, 0, AVG_DST_BUF_SIZE);
+memset(dst1, 0, AVG_DST_BUF_SIZE);
+call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, 
src01, w, h);
+call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, 
src11, w, h);
+if (memcmp(dst0, dst1, DST_BUF_SIZE))
+fail();
+if (w == h)
+bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, 
src00, src01, w, h);
+}
+}
+{
+declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, 
void, uint8_t *dst, ptrdiff_t dst_stride,
+const int16_t *src0, const int16_t *src1, int width, 
int height,
+int denom, int w0, int w1, int o0, int o1);
+{
+const int denom = rnd() % 8;
+const int w0= rnd() % 256 - 128;
+const int w1= rnd() % 256 - 128;
+const int o0= rnd() % 256 - 128;
+const int o1= rnd() % 256 - 128;
+if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", 
bit_depth, w, h)) {
+memset(dst0, 0, AVG_DST_BUF_SIZE);
+memset(dst1, 0, AVG_DST_BUF_SIZE);
+
+call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, 
src01, w, h, denom, w0, w1, o0, o1);
+call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, 
src11, w, h, denom, w0, w1, o0, o1);
+if (memcmp(dst0, dst1, DST_BUF_SIZE))
+fail();
+if (w == h)
+bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, 
src00, src01, w, h, denom, w0, w1, o0, o1);
+}
+}
+}
+}
+}
+}
+report("avg");
+}
+
 void checkasm_check_vvc_mc(void)
 {
 check_put_vvc_luma();
 check_put_vvc_luma_uni();
 check_put_vvc_chroma();
 check_put_vvc_chroma_uni();
+check_avg();
 }
-- 
2.34.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations

2024-01-22 Thread toqsxw
From: Wu Jianhua 

The avg/avg_w is based on dav1d.
See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm

vvc_avg_8_2x2_c: 71.6
vvc_avg_8_2x2_avx2: 26.8
vvc_avg_8_2x4_c: 140.8
vvc_avg_8_2x4_avx2: 34.6
vvc_avg_8_2x8_c: 410.3
vvc_avg_8_2x8_avx2: 41.3
vvc_avg_8_2x16_c: 769.3
vvc_avg_8_2x16_avx2: 60.3
vvc_avg_8_2x32_c: 1669.6
vvc_avg_8_2x32_avx2: 105.1
vvc_avg_8_2x64_c: 1978.3
vvc_avg_8_2x64_avx2: 425.8
vvc_avg_8_2x128_c: 6536.8
vvc_avg_8_2x128_avx2: 1315.1
vvc_avg_8_4x2_c: 155.6
vvc_avg_8_4x2_avx2: 26.1
vvc_avg_8_4x4_c: 250.3
vvc_avg_8_4x4_avx2: 31.3
vvc_avg_8_4x8_c: 831.8
vvc_avg_8_4x8_avx2: 41.3
vvc_avg_8_4x16_c: 1461.1
vvc_avg_8_4x16_avx2: 57.1
vvc_avg_8_4x32_c: 2821.6
vvc_avg_8_4x32_avx2: 105.1
vvc_avg_8_4x64_c: 3615.8
vvc_avg_8_4x64_avx2: 412.6
vvc_avg_8_4x128_c: 11962.6
vvc_avg_8_4x128_avx2: 1274.3
vvc_avg_8_8x2_c: 215.8
vvc_avg_8_8x2_avx2: 29.1
vvc_avg_8_8x4_c: 430.6
vvc_avg_8_8x4_avx2: 37.6
vvc_avg_8_8x8_c: 1463.3
vvc_avg_8_8x8_avx2: 51.8
vvc_avg_8_8x16_c: 2630.1
vvc_avg_8_8x16_avx2: 97.6
vvc_avg_8_8x32_c: 5813.8
vvc_avg_8_8x32_avx2: 196.6
vvc_avg_8_8x64_c: 6687.3
vvc_avg_8_8x64_avx2: 487.8
vvc_avg_8_8x128_c: 13178.6
vvc_avg_8_8x128_avx2: 1290.6
vvc_avg_8_16x2_c: 443.8
vvc_avg_8_16x2_avx2: 28.3
vvc_avg_8_16x4_c: 1253.3
vvc_avg_8_16x4_avx2: 32.1
vvc_avg_8_16x8_c: 2236.3
vvc_avg_8_16x8_avx2: 44.3
vvc_avg_8_16x16_c: 5127.8
vvc_avg_8_16x16_avx2: 63.3
vvc_avg_8_16x32_c: 6573.3
vvc_avg_8_16x32_avx2: 223.6
vvc_avg_8_16x64_c: 30311.8
vvc_avg_8_16x64_avx2: 437.8
vvc_avg_8_16x128_c: 25693.3
vvc_avg_8_16x128_avx2: 1266.8
vvc_avg_8_32x2_c: 954.6
vvc_avg_8_32x2_avx2: 32.1
vvc_avg_8_32x4_c: 2359.6
vvc_avg_8_32x4_avx2: 39.6
vvc_avg_8_32x8_c: 5703.6
vvc_avg_8_32x8_avx2: 57.1
vvc_avg_8_32x16_c: 9967.6
vvc_avg_8_32x16_avx2: 107.1
vvc_avg_8_32x32_c: 21327.6
vvc_avg_8_32x32_avx2: 272.6
vvc_avg_8_32x64_c: 39240.8
vvc_avg_8_32x64_avx2: 529.6
vvc_avg_8_32x128_c: 52580.8
vvc_avg_8_32x128_avx2: 1338.8
vvc_avg_8_64x2_c: 1647.3
vvc_avg_8_64x2_avx2: 38.8
vvc_avg_8_64x4_c: 5130.1
vvc_avg_8_64x4_avx2: 58.8
vvc_avg_8_64x8_c: 6529.3
vvc_avg_8_64x8_avx2: 88.3
vvc_avg_8_64x16_c: 19913.6
vvc_avg_8_64x16_avx2: 162.3
vvc_avg_8_64x32_c: 39360.8
vvc_avg_8_64x32_avx2: 295.8
vvc_avg_8_64x64_c: 49658.3
vvc_avg_8_64x64_avx2: 784.1
vvc_avg_8_64x128_c: 108513.1
vvc_avg_8_64x128_avx2: 1977.1
vvc_avg_8_128x2_c: 3226.1
vvc_avg_8_128x2_avx2: 61.1
vvc_avg_8_128x4_c: 10280.3
vvc_avg_8_128x4_avx2: 94.6
vvc_avg_8_128x8_c: 18079.3
vvc_avg_8_128x8_avx2: 155.3
vvc_avg_8_128x16_c: 45121.8
vvc_avg_8_128x16_avx2: 285.3
vvc_avg_8_128x32_c: 48651.8
vvc_avg_8_128x32_avx2: 581.6
vvc_avg_8_128x64_c: 165078.6
vvc_avg_8_128x64_avx2: 1942.8
vvc_avg_8_128x128_c: 339103.1
vvc_avg_8_128x128_avx2: 4332.6
vvc_avg_10_2x2_c: 144.3
vvc_avg_10_2x2_avx2: 26.8
vvc_avg_10_2x4_c: 142.6
vvc_avg_10_2x4_avx2: 45.3
vvc_avg_10_2x8_c: 478.1
vvc_avg_10_2x8_avx2: 38.1
vvc_avg_10_2x16_c: 518.3
vvc_avg_10_2x16_avx2: 58.1
vvc_avg_10_2x32_c: 2059.8
vvc_avg_10_2x32_avx2: 93.1
vvc_avg_10_2x64_c: 2383.8
vvc_avg_10_2x64_avx2: 714.8
vvc_avg_10_2x128_c: 4498.3
vvc_avg_10_2x128_avx2: 1466.3
vvc_avg_10_4x2_c: 228.6
vvc_avg_10_4x2_avx2: 26.8
vvc_avg_10_4x4_c: 378.3
vvc_avg_10_4x4_avx2: 30.6
vvc_avg_10_4x8_c: 866.8
vvc_avg_10_4x8_avx2: 44.6
vvc_avg_10_4x16_c: 1018.1
vvc_avg_10_4x16_avx2: 58.1
vvc_avg_10_4x32_c: 3590.8
vvc_avg_10_4x32_avx2: 128.8
vvc_avg_10_4x64_c: 4200.8
vvc_avg_10_4x64_avx2: 663.6
vvc_avg_10_4x128_c: 8450.8
vvc_avg_10_4x128_avx2: 1531.8
vvc_avg_10_8x2_c: 369.3
vvc_avg_10_8x2_avx2: 28.3
vvc_avg_10_8x4_c: 513.8
vvc_avg_10_8x4_avx2: 32.1
vvc_avg_10_8x8_c: 1720.3
vvc_avg_10_8x8_avx2: 49.1
vvc_avg_10_8x16_c: 1894.8
vvc_avg_10_8x16_avx2: 71.6
vvc_avg_10_8x32_c: 3931.3
vvc_avg_10_8x32_avx2: 148.1
vvc_avg_10_8x64_c: 7964.3
vvc_avg_10_8x64_avx2: 613.1
vvc_avg_10_8x128_c: 15540.1
vvc_avg_10_8x128_avx2: 1585.1
vvc_avg_10_16x2_c: 877.3
vvc_avg_10_16x2_avx2: 27.6
vvc_avg_10_16x4_c: 955.8
vvc_avg_10_16x4_avx2: 29.8
vvc_avg_10_16x8_c: 3419.6
vvc_avg_10_16x8_avx2: 62.6
vvc_avg_10_16x16_c: 3826.8
vvc_avg_10_16x16_avx2: 54.3
vvc_avg_10_16x32_c: 7655.3
vvc_avg_10_16x32_avx2: 86.3
vvc_avg_10_16x64_c: 30011.1
vvc_avg_10_16x64_avx2: 692.6
vvc_avg_10_16x128_c: 47894.8
vvc_avg_10_16x128_avx2: 1580.3
vvc_avg_10_32x2_c: 944.3
vvc_avg_10_32x2_avx2: 29.8
vvc_avg_10_32x4_c: 2022.6
vvc_avg_10_32x4_avx2: 35.1
vvc_avg_10_32x8_c: 6148.8
vvc_avg_10_32x8_avx2: 51.3
vvc_avg_10_32x16_c: 12601.6
vvc_avg_10_32x16_avx2: 70.8
vvc_avg_10_32x32_c: 15958.6
vvc_avg_10_32x32_avx2: 124.3
vvc_avg_10_32x64_c: 31784.6
vvc_avg_10_32x64_avx2: 757.3
vvc_avg_10_32x128_c: 63892.8
vvc_avg_10_32x128_avx2: 1711.3
vvc_avg_10_64x2_c: 1890.8
vvc_avg_10_64x2_avx2: 34.3
vvc_avg_10_64x4_c: 6267.3
vvc_avg_10_64x4_avx2: 42.6
vvc_avg_10_64x8_c: 12778.1
vvc_avg_10_64x8_avx2: 67.8
vvc_avg_10_64x16_c: 22304.3
vvc_avg_10_64x16_avx2: 116.8
vvc_avg_10_64x32_c: 30777.1
vvc_avg_10_64x32_avx2: 201.1
vvc_avg_10_64x64_c: 60169.1
vvc_avg_10_64x64_avx2: 1454.3
vvc_avg_10_64x128_c: 124392.8
vvc_avg_10_64x128_avx2: 3648.6

[FFmpeg-devel] [PATCH v3 5/8] avcodec/vvcdec: reuse h26x/2656_inter.asm to enable x86 optimizations

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/Makefile  |   1 +
 libavcodec/vvc/vvcdsp.c  |   4 +
 libavcodec/vvc/vvcdsp.h  |   2 +
 libavcodec/x86/vvc/Makefile  |   6 +
 libavcodec/x86/vvc/vvcdsp_init.c | 202 +++
 5 files changed, 215 insertions(+)
 create mode 100644 libavcodec/x86/vvc/Makefile
 create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index bb42095165..ce33631b60 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -65,6 +65,7 @@ OBJS = ac3_parser.o   
  \
 
 # subsystems
 include $(SRC_PATH)/libavcodec/vvc/Makefile
+include $(SRC_PATH)/libavcodec/x86/vvc/Makefile
 OBJS-$(CONFIG_AANDCTTABLES)+= aandcttab.o
 OBJS-$(CONFIG_AC3DSP)  += ac3dsp.o ac3.o ac3tab.o
 OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o 
mpeg4audio_sample_rates.o
diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c
index c82ea7be30..c542be5258 100644
--- a/libavcodec/vvc/vvcdsp.c
+++ b/libavcodec/vvc/vvcdsp.c
@@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
 VVC_DSP(8);
 break;
 }
+
+#if ARCH_X86
+ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
+#endif
 }
diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h
index b5a63c5833..6f59e73654 100644
--- a/libavcodec/vvc/vvcdsp.h
+++ b/libavcodec/vvc/vvcdsp.h
@@ -167,4 +167,6 @@ typedef struct VVCDSPContext {
 
 void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
 
+void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
+
 #endif /* AVCODEC_VVC_VVCDSP_H */
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
new file mode 100644
index 0000000000..b4acc22501
--- /dev/null
+++ b/libavcodec/x86/vvc/Makefile
@@ -0,0 +1,6 @@
+clean::
+   $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%)
+
+OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o
+X86ASM-OBJS-$(CONFIG_VVC_DECODER)  += x86/h26x/h2656dsp.o   \
+   
  x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..c197cdb4cc
--- /dev/null
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -0,0 +1,202 @@
+/*
+ * VVC DSP init for x86
+ *
+ * Copyright (C) 2022-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vvc/vvcdec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvcdsp.h"
+#include "libavcodec/x86/h26x/h2656dsp.h"
+
+#define FW_PUT(name, depth, opt) \
+static void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const 
uint8_t *src, ptrdiff_t srcstride, \
+ int height, const int8_t *hf, 
const int8_t *vf, int width)\
+{  
\
+ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, 
srcstride, height, hf, vf, width); \
+}
+
+#define FW_PUT_TAP(fname, bitd, opt ) \
+FW_PUT(fname##4,   bitd, opt );   \
+FW_PUT(fname##8,   bitd, opt );   \
+FW_PUT(fname##16,  bitd, opt );   \
+FW_PUT(fname##32,  bitd, opt );   \
+FW_PUT(fname##64,  bitd, opt );   \
+FW_PUT(fname##128, bitd, opt );   \
+
+#define FW_PUT_4TAP(fname, bitd, opt) \
+FW_PUT(fname ## 2, bitd, opt) \
+FW_PUT_TAP(fname,  bitd, opt)
+
+#define FW_PUT_4TAP_SSE4(bitd)   \
+FW_PUT_4TAP(pixels,  bitd, sse4) \
+FW_PUT_4TAP(4tap_h,  bitd, sse4) \
+FW_PUT_4TAP(4tap_v,  bitd, sse4) \
+FW_PUT_4TAP(4tap_hv, bitd, sse4)
+
+#define FW_PUT_8TAP_SSE4(bitd)  \
+FW_PUT_TAP(8tap_h,  bitd, sse4) \
+FW_PUT_TAP(8tap_v,  bitd, sse4) \
+FW_PUT_TAP(8tap_hv, bitd, sse4)
+
+#define FW_PUT_SSE4(bitd)  \
+FW_PUT_4TAP_SSE4(bitd) \
+FW_PUT_8TAP_SSE4(bitd)
+
+FW_PUT_SSE4( 8);
+FW_PUT_SSE4(10);
+FW_PUT_SSE4(12);
+
+#define 

[FFmpeg-devel] [PATCH v3 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/h26x/h2656_inter.asm | 32 ++---
 libavcodec/x86/h26x/h2656dsp.c  |  4 ++--
 libavcodec/x86/h26x/h2656dsp.h  |  2 +-
 libavcodec/x86/hevcdsp_init.c   |  2 +-
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
index aa296d549c..cbba0c1ea5 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -22,8 +22,6 @@
 ; */
 %include "libavutil/x86/x86util.asm"
 
-%define MAX_PB_SIZE 64
-
 SECTION_RODATA 32
 cextern pw_255
 cextern pw_512
@@ -342,7 +340,7 @@ SECTION .text
 %endmacro
 
 %macro LOOP_END 3
-add  %1q, 2*MAX_PB_SIZE  ; dst += dststride
+add  %1q, dststrideq ; dst += dststride
 add  %2q, %3q; src += srcstride
 dec  heightd ; cmp height
 jnz   .loop  ; height loop
@@ -539,7 +537,7 @@ SECTION .text
 
 
 ; **
-; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ; int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
 
@@ -549,7 +547,7 @@ SECTION .text
 %endmacro
 
 %macro MC_PIXELS 3
-cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height
+cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height
 pxor  m2, m2
 .loop:
 SIMPLE_LOAD   %2, %3, srcq, m0
@@ -579,10 +577,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, 
src, srcstride, height
 %endif
 
 ; **
-; void %1_put_4tap_hX(int16_t *dst,
+; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width);
 ; **
-cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf
+cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, 
height, hf
 %assign %%stride ((%3 + 7)/8)
 MC_4TAP_FILTER   %3, hf, m4, m5
 .loop:
@@ -612,10 +610,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, 
dststride, src, srcstride,
 RET
 
 ; **
-; void %1_put_4tap_v(int16_t *dst,
+; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width)
 ; **
-cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, 
r3src, vf
+cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, 
height, r3src, vf
 sub srcq, srcstrideq
 MC_4TAP_FILTER%3, vf, m4, m5
 lea   r3srcq, [srcstrideq*3]
@@ -649,10 +647,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, 
dststride, src, srcstride,
 
 %macro PUT_4TAP_HV 3
 ; **
-; void put_4tap_hv(int16_t *dst,
+; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width)
 ; **
-cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, 
r3src
+cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, 
height, hf, vf, r3src
 %assign %%stride ((%3 + 7)/8)
 sub srcq, srcstrideq
 MC_4TAP_HV_FILTER%3
@@ -784,12 +782,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, 
dststride, src, srcstride, heig
 %endmacro
 
 ; **
-; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ;   int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
 
 %macro PUT_8TAP 3
-cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf
+cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, 
hf
 MC_8TAP_FILTER  %3, hf
 .loop:
 MC_8TAP_H_LOAD  %3, srcq, %2, 10
@@ -824,10 +822,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, 
dststride, src, srcstride, heigh
 
 
 ; **
-; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ;  int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
-cglobal %1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf
+cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height, 

[FFmpeg-devel] [PATCH v3 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm

2024-01-22 Thread toqsxw
From: Wu Jianhua 

This enables the asm optimizations to be reused by VVC.

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/Makefile |1 +
 libavcodec/x86/h26x/h2656_inter.asm | 1145 +++
 libavcodec/x86/h26x/h2656dsp.c  |   98 +++
 libavcodec/x86/h26x/h2656dsp.h  |  103 +++
 libavcodec/x86/hevc_mc.asm  |  462 +--
 libavcodec/x86/hevcdsp_init.c   |  108 ++-
 6 files changed, 1471 insertions(+), 446 deletions(-)
 create mode 100644 libavcodec/x86/h26x/h2656_inter.asm
 create mode 100644 libavcodec/x86/h26x/h2656dsp.c
 create mode 100644 libavcodec/x86/h26x/h2656dsp.h

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index d5fb30645a..8098cd840c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += 
x86/hevc_add_res.o\
   x86/hevc_deblock.o\
   x86/hevc_idct.o   \
   x86/hevc_mc.o \
+  x86/h26x/h2656_inter.o\
   x86/hevc_sao.o\
   x86/hevc_sao_10bit.o
 X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
new file mode 100644
index 0000000000..aa296d549c
--- /dev/null
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -0,0 +1,1145 @@
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; * Copyright (c) 2023-2024 Nuo Mi
+; * Copyright (c) 2023-2024 Wu Jianhua
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 64
+
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+cextern pw_8192
+%define scale_8 pw_512
+%define scale_10 pw_2048
+%define scale_12 pw_8192
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+max_pixels_12:  times 16 dw ((1 << 12)-1)
+cextern pb_0
+
+SECTION .text
+%macro SIMPLE_LOAD 4;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+movd  %4, [%3]   ; 
load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+movq  %4, [%3]   ; 
load data from source
+%elif notcpuflag(avx)
+movu  %4, [%3]   ; 
load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+movdqu   %4, [%3]
+%else
+movu  %4, [%3]
+%endif
+%endmacro
+
+%macro VPBROADCASTW 2
+%if notcpuflag(avx2)
+movd   %1, %2
+pshuflw%1, %1, 0
+punpcklwd  %1, %1
+%else
+vpbroadcastw   %1, %2
+%endif
+%endmacro
+
+%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b,
+VPBROADCASTW   %3, [%2q + 0 * 2]  ; coeff 0, 1
+VPBROADCASTW   %4, [%2q + 1 * 2]  ; coeff 2, 3
+%if %1 != 8
+pmovsxbw   %3, xmm%3
+pmovsxbw   %4, xmm%4
+%endif
+%endmacro
+
+%macro MC_4TAP_HV_FILTER 1
+VPBROADCASTW  m12, [vfq + 0 * 2]  ; vf 0, 1
+VPBROADCASTW  m13, [vfq + 1 * 2]  ; vf 2, 3
+VPBROADCASTW  m14, [hfq + 0 * 2]  ; hf 0, 1
+VPBROADCASTW  m15, [hfq + 1 * 2]  ; hf 2, 3
+
+pmovsxbw  m12, xm12
+pmovsxbw  m13, xm13
+%if %1 != 8
+pmovsxbw  m14, xm14
+pmovsxbw  m15, xm15
+%endif
+lea   r3srcq, [srcstrideq*3]
+%endmacro
+
+%macro MC_8TAP_SAVE_FILTER 5;offset, mm registers
+mova [rsp + %1 + 0*mmsize], %2
+mova [rsp + %1 + 1*mmsize], %3
+mova [rsp + %1 + 2*mmsize], %4
+mova [rsp + %1 + 3*mmsize], %5
+%endmacro
+
+%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset
+VPBROADCASTW  m12, [%2q + 0 * 2]  ; coeff 0, 1
+VPBROADCASTW  m13, [%2q + 1 * 2]  ; coeff 2, 3
+VPBROADCASTW  m14, [%2q + 2 * 2]  ; coeff 4, 5
+VPBROADCASTW  m15, [%2q + 3 * 2] 

[FFmpeg-devel] [PATCH v3 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/hevcdsp_template.c | 594 +++---
 1 file changed, 46 insertions(+), 548 deletions(-)

diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 0de14e9dcf..9b48bdf08e 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -26,6 +26,7 @@
 #include "bit_depth_template.c"
 #include "hevcdsp.h"
 #include "h26x/h2656_sao_template.c"
+#include "h26x/h2656_inter_template.c"
 
 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int 
height,
   GetBitContext *gb, int pcm_bit_depth)
@@ -299,37 +300,51 @@ IDCT_DC(32)
 

 //
 

-static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
-  const uint8_t *_src, ptrdiff_t 
_srcstride,
-  int height, intptr_t mx, intptr_t my, 
int width)
-{
-int x, y;
-const pixel *src= (const pixel *)_src;
-ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-
-for (y = 0; y < height; y++) {
-for (x = 0; x < width; x++)
-dst[x] = src[x] << (14 - BIT_DEPTH);
-src += srcstride;
-dst += MAX_PB_SIZE;
-}
-}
-
-static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, 
const uint8_t *_src, ptrdiff_t _srcstride,
-  int height, intptr_t mx, intptr_t 
my, int width)
-{
-int y;
-const pixel *src= (const pixel *)_src;
-ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-pixel *dst  = (pixel *)_dst;
-ptrdiff_t dststride = _dststride / sizeof(pixel);
-
-for (y = 0; y < height; y++) {
-memcpy(dst, src, width * sizeof(pixel));
-src += srcstride;
-dst += dststride;
-}
-}
+#define ff_hevc_pel_filters ff_hevc_qpel_filters
+#define DECL_HV_FILTER(f)  \
+const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \
+const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1];
+
+#define FW_PUT(p, f, t)
   \
+static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t 
srcstride, int height,\
+  intptr_t mx, intptr_t my, int width) 
   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width);   
   \
+}
+
+#define FW_PUT_UNI(p, f, t)
   \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const 
uint8_t *src,   \
+  ptrdiff_t srcstride, int height, intptr_t 
mx, intptr_t my, int width)   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width);
   \
+}
+
+#define FW_PUT_UNI_W(p, f, t)  
   \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const 
uint8_t *src,   \
+  ptrdiff_t srcstride,int height, int denom, 
int wx, int ox,  \
+  intptr_t mx, intptr_t my, int width) 
   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, 
vf, width);\
+}
+
+#define FW_PUT_FUNCS(f, t, dir)   \
+FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \
+FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir)\
+FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir)
+
+FW_PUT(pel, pel_pixels, pixels)
+FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels)
+FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels)
+
+FW_PUT_FUNCS(qpel, luma,   h )
+FW_PUT_FUNCS(qpel, luma,   v )
+FW_PUT_FUNCS(qpel, luma,   hv)
+FW_PUT_FUNCS(epel, chroma, h )
+FW_PUT_FUNCS(epel, chroma, v )
+FW_PUT_FUNCS(epel, chroma, hv)
 
 static void FUNC(put_hevc_pel_bi_pixels)(uint8_t 

[FFmpeg-devel] [PATCH v3 1/8] avcodec/vvc/vvc_inter_template: move put/put_luma/put_chroma template to h2656_inter_template.c

2024-01-22 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/h26x/h2656_inter_template.c | 577 +
 libavcodec/vvc/vvc_inter_template.c| 559 +---
 2 files changed, 578 insertions(+), 558 deletions(-)
 create mode 100644 libavcodec/h26x/h2656_inter_template.c

diff --git a/libavcodec/h26x/h2656_inter_template.c 
b/libavcodec/h26x/h2656_inter_template.c
new file mode 100644
index 0000000000..864f6c7e7d
--- /dev/null
+++ b/libavcodec/h26x/h2656_inter_template.c
@@ -0,0 +1,577 @@
+/*
+ * inter prediction template for HEVC/VVC
+ *
+ * Copyright (C) 2022 Nuo Mi
+ * Copyright (C) 2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define CHROMA_EXTRA_BEFORE 1
+#define CHROMA_EXTRA        3
+#define LUMA_EXTRA_BEFORE   3
+#define LUMA_EXTRA  7
+
+static void FUNC(put_pixels)(int16_t *dst,
+const uint8_t *_src, const ptrdiff_t _src_stride,
+const int height, const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src= (const pixel *)_src;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++)
+dst[x] = src[x] << (14 - BIT_DEPTH);
+src += src_stride;
+dst += MAX_PB_SIZE;
+}
+}
+
+static void FUNC(put_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+const uint8_t *_src, const ptrdiff_t _src_stride, const int height,
+ const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src= (const pixel *)_src;
+pixel *dst  = (pixel *)_dst;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
+
+for (int y = 0; y < height; y++) {
+memcpy(dst, src, width * sizeof(pixel));
+src += src_stride;
+dst += dst_stride;
+}
+}
+
+static void FUNC(put_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+const uint8_t *_src, const ptrdiff_t _src_stride, const int height,
+const int denom, const int wx, const int _ox,  const int8_t *hf, const 
int8_t *vf,
+const int width)
+{
+const pixel *src= (const pixel *)_src;
+pixel *dst  = (pixel *)_dst;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
+const int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+const int offset= 1 << (shift - 1);
+#else
+const int offset= 0;
+#endif
+const int ox= _ox * (1 << (BIT_DEPTH - 8));
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++) {
+const int v = (src[x] << (14 - BIT_DEPTH));
+dst[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox);
+}
+src += src_stride;
+dst += dst_stride;
+}
+}
+
+#define LUMA_FILTER(src, stride)   
\
+(filter[0] * src[x - 3 * stride] + 
\
+ filter[1] * src[x - 2 * stride] + 
\
+ filter[2] * src[x - stride] + 
\
+ filter[3] * src[x ] + 
\
+ filter[4] * src[x + stride] + 
\
+ filter[5] * src[x + 2 * stride] + 
\
+ filter[6] * src[x + 3 * stride] + 
\
+ filter[7] * src[x + 4 * stride])
+
+static void FUNC(put_luma_h)(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
+const int height, const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src   = (const pixel*)_src;
+const ptrdiff_t src_stride = _src_stride / sizeof(pixel);
+const int8_t *filter   = hf;
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++)
+dst[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8);
+src += src_stride;
+dst += MAX_PB_SIZE;
+}
+}
+
+static void 

[FFmpeg-devel] [PATCH v2 8/8] tests/checkasm/vvc_mc: add check_avg

2024-01-19 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/vvc_mc.c | 64 +
 1 file changed, 64 insertions(+)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 711280deec..8adb00573f 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -35,6 +35,7 @@
 static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 
0x3fff3fff, 0xffffffff };
 static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
 
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
 #define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
 #define EXTRA_BEFORE 3
 #define EXTRA_AFTER  4
@@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void)
 report("put_uni_chroma");
 }
 
+#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE)
+#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2)
+
+static void check_avg(void)
+{
+LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]);
+VVCDSPContext c;
+
+for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * 
sizeof(int16_t));
+randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * 
sizeof(int16_t));
+ff_vvc_dsp_init(&c, bit_depth);
+for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) {
+for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) {
+{
+   declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, 
void, uint8_t *dst, ptrdiff_t dst_stride,
+const int16_t *src0, const int16_t *src1, int width, 
int height);
+if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, 
h)) {
+memset(dst0, 0, AVG_DST_BUF_SIZE);
+memset(dst1, 0, AVG_DST_BUF_SIZE);
+call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, 
src01, w, h);
+call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, 
src11, w, h);
+if (memcmp(dst0, dst1, DST_BUF_SIZE))
+fail();
+if (w == h)
+bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, 
src00, src01, w, h);
+}
+}
+{
+declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, 
void, uint8_t *dst, ptrdiff_t dst_stride,
+const int16_t *src0, const int16_t *src1, int width, 
int height,
+int denom, int w0, int w1, int o0, int o1);
+{
+const int denom = rnd() % 8;
+const int w0= rnd() % 256 - 128;
+const int w1= rnd() % 256 - 128;
+const int o0= rnd() % 256 - 128;
+const int o1= rnd() % 256 - 128;
+if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", 
bit_depth, w, h)) {
+memset(dst0, 0, AVG_DST_BUF_SIZE);
+memset(dst1, 0, AVG_DST_BUF_SIZE);
+
+call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, 
src01, w, h, denom, w0, w1, o0, o1);
+call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, 
src11, w, h, denom, w0, w1, o0, o1);
+if (memcmp(dst0, dst1, DST_BUF_SIZE))
+fail();
+if (w == h)
+bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, 
src00, src01, w, h, denom, w0, w1, o0, o1);
+}
+}
+}
+}
+}
+}
+report("avg");
+}
+
 void checkasm_check_vvc_mc(void)
 {
 check_put_vvc_luma();
 check_put_vvc_luma_uni();
 check_put_vvc_chroma();
 check_put_vvc_chroma_uni();
+check_avg();
 }
-- 
2.34.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations

2024-01-19 Thread toqsxw
From: Wu Jianhua 

The avg/avg_w is based on dav1d.
See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm

vvc_avg_8_2x2_c: 71.6
vvc_avg_8_2x2_avx2: 26.8
vvc_avg_8_2x4_c: 140.8
vvc_avg_8_2x4_avx2: 34.6
vvc_avg_8_2x8_c: 410.3
vvc_avg_8_2x8_avx2: 41.3
vvc_avg_8_2x16_c: 769.3
vvc_avg_8_2x16_avx2: 60.3
vvc_avg_8_2x32_c: 1669.6
vvc_avg_8_2x32_avx2: 105.1
vvc_avg_8_2x64_c: 1978.3
vvc_avg_8_2x64_avx2: 425.8
vvc_avg_8_2x128_c: 6536.8
vvc_avg_8_2x128_avx2: 1315.1
vvc_avg_8_4x2_c: 155.6
vvc_avg_8_4x2_avx2: 26.1
vvc_avg_8_4x4_c: 250.3
vvc_avg_8_4x4_avx2: 31.3
vvc_avg_8_4x8_c: 831.8
vvc_avg_8_4x8_avx2: 41.3
vvc_avg_8_4x16_c: 1461.1
vvc_avg_8_4x16_avx2: 57.1
vvc_avg_8_4x32_c: 2821.6
vvc_avg_8_4x32_avx2: 105.1
vvc_avg_8_4x64_c: 3615.8
vvc_avg_8_4x64_avx2: 412.6
vvc_avg_8_4x128_c: 11962.6
vvc_avg_8_4x128_avx2: 1274.3
vvc_avg_8_8x2_c: 215.8
vvc_avg_8_8x2_avx2: 29.1
vvc_avg_8_8x4_c: 430.6
vvc_avg_8_8x4_avx2: 37.6
vvc_avg_8_8x8_c: 1463.3
vvc_avg_8_8x8_avx2: 51.8
vvc_avg_8_8x16_c: 2630.1
vvc_avg_8_8x16_avx2: 97.6
vvc_avg_8_8x32_c: 5813.8
vvc_avg_8_8x32_avx2: 196.6
vvc_avg_8_8x64_c: 6687.3
vvc_avg_8_8x64_avx2: 487.8
vvc_avg_8_8x128_c: 13178.6
vvc_avg_8_8x128_avx2: 1290.6
vvc_avg_8_16x2_c: 443.8
vvc_avg_8_16x2_avx2: 28.3
vvc_avg_8_16x4_c: 1253.3
vvc_avg_8_16x4_avx2: 32.1
vvc_avg_8_16x8_c: 2236.3
vvc_avg_8_16x8_avx2: 44.3
vvc_avg_8_16x16_c: 5127.8
vvc_avg_8_16x16_avx2: 63.3
vvc_avg_8_16x32_c: 6573.3
vvc_avg_8_16x32_avx2: 223.6
vvc_avg_8_16x64_c: 30311.8
vvc_avg_8_16x64_avx2: 437.8
vvc_avg_8_16x128_c: 25693.3
vvc_avg_8_16x128_avx2: 1266.8
vvc_avg_8_32x2_c: 954.6
vvc_avg_8_32x2_avx2: 32.1
vvc_avg_8_32x4_c: 2359.6
vvc_avg_8_32x4_avx2: 39.6
vvc_avg_8_32x8_c: 5703.6
vvc_avg_8_32x8_avx2: 57.1
vvc_avg_8_32x16_c: 9967.6
vvc_avg_8_32x16_avx2: 107.1
vvc_avg_8_32x32_c: 21327.6
vvc_avg_8_32x32_avx2: 272.6
vvc_avg_8_32x64_c: 39240.8
vvc_avg_8_32x64_avx2: 529.6
vvc_avg_8_32x128_c: 52580.8
vvc_avg_8_32x128_avx2: 1338.8
vvc_avg_8_64x2_c: 1647.3
vvc_avg_8_64x2_avx2: 38.8
vvc_avg_8_64x4_c: 5130.1
vvc_avg_8_64x4_avx2: 58.8
vvc_avg_8_64x8_c: 6529.3
vvc_avg_8_64x8_avx2: 88.3
vvc_avg_8_64x16_c: 19913.6
vvc_avg_8_64x16_avx2: 162.3
vvc_avg_8_64x32_c: 39360.8
vvc_avg_8_64x32_avx2: 295.8
vvc_avg_8_64x64_c: 49658.3
vvc_avg_8_64x64_avx2: 784.1
vvc_avg_8_64x128_c: 108513.1
vvc_avg_8_64x128_avx2: 1977.1
vvc_avg_8_128x2_c: 3226.1
vvc_avg_8_128x2_avx2: 61.1
vvc_avg_8_128x4_c: 10280.3
vvc_avg_8_128x4_avx2: 94.6
vvc_avg_8_128x8_c: 18079.3
vvc_avg_8_128x8_avx2: 155.3
vvc_avg_8_128x16_c: 45121.8
vvc_avg_8_128x16_avx2: 285.3
vvc_avg_8_128x32_c: 48651.8
vvc_avg_8_128x32_avx2: 581.6
vvc_avg_8_128x64_c: 165078.6
vvc_avg_8_128x64_avx2: 1942.8
vvc_avg_8_128x128_c: 339103.1
vvc_avg_8_128x128_avx2: 4332.6
vvc_avg_10_2x2_c: 144.3
vvc_avg_10_2x2_avx2: 26.8
vvc_avg_10_2x4_c: 142.6
vvc_avg_10_2x4_avx2: 45.3
vvc_avg_10_2x8_c: 478.1
vvc_avg_10_2x8_avx2: 38.1
vvc_avg_10_2x16_c: 518.3
vvc_avg_10_2x16_avx2: 58.1
vvc_avg_10_2x32_c: 2059.8
vvc_avg_10_2x32_avx2: 93.1
vvc_avg_10_2x64_c: 2383.8
vvc_avg_10_2x64_avx2: 714.8
vvc_avg_10_2x128_c: 4498.3
vvc_avg_10_2x128_avx2: 1466.3
vvc_avg_10_4x2_c: 228.6
vvc_avg_10_4x2_avx2: 26.8
vvc_avg_10_4x4_c: 378.3
vvc_avg_10_4x4_avx2: 30.6
vvc_avg_10_4x8_c: 866.8
vvc_avg_10_4x8_avx2: 44.6
vvc_avg_10_4x16_c: 1018.1
vvc_avg_10_4x16_avx2: 58.1
vvc_avg_10_4x32_c: 3590.8
vvc_avg_10_4x32_avx2: 128.8
vvc_avg_10_4x64_c: 4200.8
vvc_avg_10_4x64_avx2: 663.6
vvc_avg_10_4x128_c: 8450.8
vvc_avg_10_4x128_avx2: 1531.8
vvc_avg_10_8x2_c: 369.3
vvc_avg_10_8x2_avx2: 28.3
vvc_avg_10_8x4_c: 513.8
vvc_avg_10_8x4_avx2: 32.1
vvc_avg_10_8x8_c: 1720.3
vvc_avg_10_8x8_avx2: 49.1
vvc_avg_10_8x16_c: 1894.8
vvc_avg_10_8x16_avx2: 71.6
vvc_avg_10_8x32_c: 3931.3
vvc_avg_10_8x32_avx2: 148.1
vvc_avg_10_8x64_c: 7964.3
vvc_avg_10_8x64_avx2: 613.1
vvc_avg_10_8x128_c: 15540.1
vvc_avg_10_8x128_avx2: 1585.1
vvc_avg_10_16x2_c: 877.3
vvc_avg_10_16x2_avx2: 27.6
vvc_avg_10_16x4_c: 955.8
vvc_avg_10_16x4_avx2: 29.8
vvc_avg_10_16x8_c: 3419.6
vvc_avg_10_16x8_avx2: 62.6
vvc_avg_10_16x16_c: 3826.8
vvc_avg_10_16x16_avx2: 54.3
vvc_avg_10_16x32_c: 7655.3
vvc_avg_10_16x32_avx2: 86.3
vvc_avg_10_16x64_c: 30011.1
vvc_avg_10_16x64_avx2: 692.6
vvc_avg_10_16x128_c: 47894.8
vvc_avg_10_16x128_avx2: 1580.3
vvc_avg_10_32x2_c: 944.3
vvc_avg_10_32x2_avx2: 29.8
vvc_avg_10_32x4_c: 2022.6
vvc_avg_10_32x4_avx2: 35.1
vvc_avg_10_32x8_c: 6148.8
vvc_avg_10_32x8_avx2: 51.3
vvc_avg_10_32x16_c: 12601.6
vvc_avg_10_32x16_avx2: 70.8
vvc_avg_10_32x32_c: 15958.6
vvc_avg_10_32x32_avx2: 124.3
vvc_avg_10_32x64_c: 31784.6
vvc_avg_10_32x64_avx2: 757.3
vvc_avg_10_32x128_c: 63892.8
vvc_avg_10_32x128_avx2: 1711.3
vvc_avg_10_64x2_c: 1890.8
vvc_avg_10_64x2_avx2: 34.3
vvc_avg_10_64x4_c: 6267.3
vvc_avg_10_64x4_avx2: 42.6
vvc_avg_10_64x8_c: 12778.1
vvc_avg_10_64x8_avx2: 67.8
vvc_avg_10_64x16_c: 22304.3
vvc_avg_10_64x16_avx2: 116.8
vvc_avg_10_64x32_c: 30777.1
vvc_avg_10_64x32_avx2: 201.1
vvc_avg_10_64x64_c: 60169.1
vvc_avg_10_64x64_avx2: 1454.3
vvc_avg_10_64x128_c: 124392.8
vvc_avg_10_64x128_avx2: 3648.6

[FFmpeg-devel] [PATCH v2 6/8] tests/checkasm: add checkasm_check_vvc_mc

2024-01-19 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/Makefile   |   1 +
 tests/checkasm/checkasm.c |   3 +
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vvc_mc.c   | 270 ++
 4 files changed, 275 insertions(+)
 create mode 100644 tests/checkasm/vvc_mc.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3b5b54352b..3562acb2b2 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_mc.o
 
 CHECKASMOBJS-$(CONFIG_AVCODEC)  += $(AVCODECOBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 87f24c77ca..36a97957e5 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -194,6 +194,9 @@ static const struct {
 #if CONFIG_VORBIS_DECODER
 { "vorbisdsp", checkasm_check_vorbisdsp },
 #endif
+#if CONFIG_VVC_DECODER
+{ "vvc_mc", checkasm_check_vvc_mc },
+#endif
 #endif
 #if CONFIG_AVFILTER
 #if CONFIG_AFIR_FILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 4db8c495ea..53cb3ccfbf 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
 void checkasm_check_vorbisdsp(void);
+void checkasm_check_vvc_mc(void);
 
 struct CheckasmPerf;
 
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
new file mode 100644
index 0000000000..711280deec
--- /dev/null
+++ b/tests/checkasm/vvc_mc.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2023-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvc_data.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 
0x3fff3fff, 0xffffffff };
+static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
+
+#define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
+#define EXTRA_BEFORE 3
+#define EXTRA_AFTER  4
+#define SRC_EXTRA    (EXTRA_BEFORE + EXTRA_AFTER) * 2
+#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA)
+#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2)
+#define SRC_OFFSET   ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE)
+
+#define randomize_buffers(buf0, buf1, size, mask)   \
+do {\
+int k;  \
+for (k = 0; k < size; k += 4) { \
+uint32_t r = rnd() & mask;  \
+AV_WN32A(buf0 + k, r);  \
+AV_WN32A(buf1 + k, r);  \
+}   \
+} while (0)
+
+#define randomize_pixels(buf0, buf1, size)  \
+do {\
+uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+randomize_buffers(buf0, buf1, size, mask);  \
+} while (0)
+
+#define randomize_avg_src(buf0, buf1, size) \
+do {\
+uint32_t mask = 0x3fff3fff; \
+randomize_buffers(buf0, buf1, size, mask);  \
+} while (0)
+
+static void check_put_vvc_luma(void)
+{
+LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]);
+LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]);
+LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+VVCDSPContext c;
+
+declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t 
*dst, const uint8_t *src, const ptrdiff_t src_stride,
+  

[FFmpeg-devel] [PATCH v2 5/8] avcodec/vvcdec: reuse h26x/2656_inter.asm to enable x86 optimizations

2024-01-19 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/Makefile  |   1 +
 libavcodec/vvc/vvcdsp.c  |   4 +
 libavcodec/vvc/vvcdsp.h  |   2 +
 libavcodec/x86/vvc/Makefile  |   6 +
 libavcodec/x86/vvc/vvcdsp_init.c | 202 +++
 5 files changed, 215 insertions(+)
 create mode 100644 libavcodec/x86/vvc/Makefile
 create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index bb42095165..ce33631b60 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -65,6 +65,7 @@ OBJS = ac3_parser.o   
  \
 
 # subsystems
 include $(SRC_PATH)/libavcodec/vvc/Makefile
+include $(SRC_PATH)/libavcodec/x86/vvc/Makefile
 OBJS-$(CONFIG_AANDCTTABLES)+= aandcttab.o
 OBJS-$(CONFIG_AC3DSP)  += ac3dsp.o ac3.o ac3tab.o
 OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o 
mpeg4audio_sample_rates.o
diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c
index c82ea7be30..c542be5258 100644
--- a/libavcodec/vvc/vvcdsp.c
+++ b/libavcodec/vvc/vvcdsp.c
@@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
 VVC_DSP(8);
 break;
 }
+
+#if ARCH_X86
+ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
+#endif
 }
diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h
index b5a63c5833..6f59e73654 100644
--- a/libavcodec/vvc/vvcdsp.h
+++ b/libavcodec/vvc/vvcdsp.h
@@ -167,4 +167,6 @@ typedef struct VVCDSPContext {
 
 void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
 
+void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
+
 #endif /* AVCODEC_VVC_VVCDSP_H */
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
new file mode 100644
index 0000000000..b4acc22501
--- /dev/null
+++ b/libavcodec/x86/vvc/Makefile
@@ -0,0 +1,6 @@
+clean::
+   $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%)
+
+OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o
+X86ASM-OBJS-$(CONFIG_VVC_DECODER)  += x86/h26x/h2656dsp.o   \
+   
  x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..c197cdb4cc
--- /dev/null
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -0,0 +1,202 @@
+/*
+ * VVC DSP init for x86
+ *
+ * Copyright (C) 2022-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vvc/vvcdec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvcdsp.h"
+#include "libavcodec/x86/h26x/h2656dsp.h"
+
+/*
+ * Emit a static wrapper ff_vvc_put_<name>_<depth>_<opt>() that forwards to
+ * ff_h2656_put_<name>_<depth>_<opt>(), inserting 2 * MAX_PB_SIZE as the
+ * destination-stride argument that the wrapper's own signature does not carry.
+ */
+#define FW_PUT(name, depth, opt) \
+static void ff_vvc_put_ ## name ## _ ## depth ## _ ## opt(int16_t *dst, \
+    const uint8_t *src, ptrdiff_t srcstride, int height, \
+    const int8_t *hf, const int8_t *vf, int width) \
+{ \
+    ff_h2656_put_ ## name ## _ ## depth ## _ ## opt(dst, 2 * MAX_PB_SIZE, src, srcstride, \
+                                                    height, hf, vf, width); \
+}
+
+/*
+ * Wrappers for widths 4..128. Each FW_PUT expands to a complete function
+ * definition, so no ';' follows it (a stray ';' at file scope is not valid
+ * ISO C), and the last line carries no continuation backslash.
+ */
+#define FW_PUT_TAP(fname, bitd, opt) \
+    FW_PUT(fname ## 4,   bitd, opt) \
+    FW_PUT(fname ## 8,   bitd, opt) \
+    FW_PUT(fname ## 16,  bitd, opt) \
+    FW_PUT(fname ## 32,  bitd, opt) \
+    FW_PUT(fname ## 64,  bitd, opt) \
+    FW_PUT(fname ## 128, bitd, opt)
+
+/* Same as FW_PUT_TAP plus the width-2 variant. */
+#define FW_PUT_4TAP(fname, bitd, opt) \
+    FW_PUT(fname ## 2, bitd, opt) \
+    FW_PUT_TAP(fname,  bitd, opt)
+
+#define FW_PUT_4TAP_SSE4(bitd) \
+    FW_PUT_4TAP(pixels,  bitd, sse4) \
+    FW_PUT_4TAP(4tap_h,  bitd, sse4) \
+    FW_PUT_4TAP(4tap_v,  bitd, sse4) \
+    FW_PUT_4TAP(4tap_hv, bitd, sse4)
+
+#define FW_PUT_8TAP_SSE4(bitd) \
+    FW_PUT_TAP(8tap_h,  bitd, sse4) \
+    FW_PUT_TAP(8tap_v,  bitd, sse4) \
+    FW_PUT_TAP(8tap_hv, bitd, sse4)
+
+#define FW_PUT_SSE4(bitd) \
+    FW_PUT_4TAP_SSE4(bitd) \
+    FW_PUT_8TAP_SSE4(bitd)
+
+/* Instantiate the SSE4 wrappers for 8-, 10- and 12-bit. */
+FW_PUT_SSE4( 8)
+FW_PUT_SSE4(10)
+FW_PUT_SSE4(12)
+
+#define 

[FFmpeg-devel] [PATCH v2 1/8] avcodec/vvc/vvc_inter_template: move put/put_luma/put_chroma template to h2656_inter_template.c

2024-01-19 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/h26x/h2656_inter_template.c | 577 +
 libavcodec/vvc/vvc_inter_template.c| 559 +---
 2 files changed, 578 insertions(+), 558 deletions(-)
 create mode 100644 libavcodec/h26x/h2656_inter_template.c

diff --git a/libavcodec/h26x/h2656_inter_template.c 
b/libavcodec/h26x/h2656_inter_template.c
new file mode 100644
index 0000000000..864f6c7e7d
--- /dev/null
+++ b/libavcodec/h26x/h2656_inter_template.c
@@ -0,0 +1,577 @@
+/*
+ * inter prediction template for HEVC/VVC
+ *
+ * Copyright (C) 2022 Nuo Mi
+ * Copyright (C) 2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define CHROMA_EXTRA_BEFORE 1
+#define CHROMA_EXTRA        3
+#define LUMA_EXTRA_BEFORE   3
+#define LUMA_EXTRA  7
+
+/*
+ * Copy a width x height block of pixels into the int16_t destination,
+ * shifting every sample left by (14 - BIT_DEPTH). Destination rows are
+ * MAX_PB_SIZE elements apart; hf and vf are not used.
+ */
+static void FUNC(put_pixels)(int16_t *dst,
+    const uint8_t *_src, const ptrdiff_t _src_stride,
+    const int height, const int8_t *hf, const int8_t *vf, const int width)
+{
+    const ptrdiff_t row_step = _src_stride / sizeof(pixel);
+    const pixel *row         = (const pixel *)_src;
+
+    for (int rows_left = height; rows_left > 0; rows_left--) {
+        int col = 0;
+
+        while (col < width) {
+            dst[col] = row[col] << (14 - BIT_DEPTH);
+            col++;
+        }
+        row += row_step;
+        dst += MAX_PB_SIZE;
+    }
+}
+
+/*
+ * Copy a width x height block of pixels from _src to _dst row by row.
+ * Both strides are divided by sizeof(pixel) before use; hf and vf are
+ * not used.
+ */
+static void FUNC(put_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+    const uint8_t *_src, const ptrdiff_t _src_stride, const int height,
+    const int8_t *hf, const int8_t *vf, const int width)
+{
+    const ptrdiff_t in_step  = _src_stride / sizeof(pixel);
+    const ptrdiff_t out_step = _dst_stride / sizeof(pixel);
+    const pixel *in          = (const pixel *)_src;
+    pixel *out               = (pixel *)_dst;
+    int rows_left            = height;
+
+    while (rows_left-- > 0) {
+        memcpy(out, in, width * sizeof(pixel));
+        in  += in_step;
+        out += out_step;
+    }
+}
+
+/*
+ * Weighted copy of a width x height block. Each sample is shifted left by
+ * (14 - BIT_DEPTH), multiplied by wx, rounded, shifted right by
+ * denom + 14 - BIT_DEPTH, offset by _ox scaled with (1 << (BIT_DEPTH - 8)),
+ * and clipped with av_clip_pixel(). hf and vf are not used.
+ */
+static void FUNC(put_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+    const uint8_t *_src, const ptrdiff_t _src_stride, const int height,
+    const int denom, const int wx, const int _ox, const int8_t *hf, const int8_t *vf,
+    const int width)
+{
+    const ptrdiff_t in_step  = _src_stride / sizeof(pixel);
+    const ptrdiff_t out_step = _dst_stride / sizeof(pixel);
+    const pixel *in          = (const pixel *)_src;
+    pixel *out               = (pixel *)_dst;
+    const int shift          = denom + 14 - BIT_DEPTH;
+    const int ox             = _ox * (1 << (BIT_DEPTH - 8));
+#if BIT_DEPTH < 14
+    const int offset         = 1 << (shift - 1);
+#else
+    const int offset         = 0;
+#endif
+
+    for (int y = 0; y < height; y++, in += in_step, out += out_step)
+        for (int x = 0; x < width; x++)
+            out[x] = av_clip_pixel((((in[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
+}
+
+/*
+ * Sum of eight filter taps around src[x], with taps stride elements apart
+ * (from x - 3 * stride to x + 4 * stride). Relies on x and on filter[0..7]
+ * being in scope at the point of use. Arguments are parenthesized so that
+ * expressions can be passed for src and stride.
+ */
+#define LUMA_FILTER(src, stride) \
+    (filter[0] * (src)[x - 3 * (stride)] + \
+     filter[1] * (src)[x - 2 * (stride)] + \
+     filter[2] * (src)[x -     (stride)] + \
+     filter[3] * (src)[x               ] + \
+     filter[4] * (src)[x +     (stride)] + \
+     filter[5] * (src)[x + 2 * (stride)] + \
+     filter[6] * (src)[x + 3 * (stride)] + \
+     filter[7] * (src)[x + 4 * (stride)])
+
+/*
+ * Horizontal 8-tap luma filter: every output is LUMA_FILTER over the source
+ * row with a tap distance of 1, shifted right by (BIT_DEPTH - 8). Taps come
+ * from hf; vf is not used. Destination rows are MAX_PB_SIZE elements apart.
+ */
+static void FUNC(put_luma_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+    const int height, const int8_t *hf, const int8_t *vf, const int width)
+{
+    const int8_t *filter     = hf;   /* name required by LUMA_FILTER */
+    const ptrdiff_t row_step = _src_stride / sizeof(pixel);
+    const pixel *row         = (const pixel*)_src;
+
+    for (int rows_left = height; rows_left > 0; rows_left--) {
+        int x = 0;                   /* name required by LUMA_FILTER */
+
+        while (x < width) {
+            dst[x] = LUMA_FILTER(row, 1) >> (BIT_DEPTH - 8);
+            x++;
+        }
+        row += row_step;
+        dst += MAX_PB_SIZE;
+    }
+}
+
+static void 

[FFmpeg-devel] [PATCH v2 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put

2024-01-19 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/h26x/h2656_inter.asm | 32 ++---
 libavcodec/x86/h26x/h2656dsp.c  |  4 ++--
 libavcodec/x86/h26x/h2656dsp.h  |  2 +-
 libavcodec/x86/hevcdsp_init.c   |  2 +-
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
index 4316c8ae3d..68f88832a6 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -22,8 +22,6 @@
 ; */
 %include "libavutil/x86/x86util.asm"
 
-%define MAX_PB_SIZE 64
-
 SECTION_RODATA 32
 cextern pw_255
 cextern pw_512
@@ -332,7 +330,7 @@ SECTION .text
 %endmacro
 
 %macro LOOP_END 3
-add  %1q, 2*MAX_PB_SIZE  ; dst += dststride
+add  %1q, dststrideq ; dst += dststride
 add  %2q, %3q; src += srcstride
 dec  heightd ; cmp height
 jnz   .loop  ; height loop
@@ -529,7 +527,7 @@ SECTION .text
 
 
 ; **
-; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ; int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
 
@@ -539,7 +537,7 @@ SECTION .text
 %endmacro
 
 %macro MC_PIXELS 3
-cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height
+cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height
 pxor  m2, m2
 .loop:
 SIMPLE_LOAD   %2, %3, srcq, m0
@@ -569,10 +567,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, 
src, srcstride, height
 %endif
 
 ; **
-; void %1_put_4tap_hX(int16_t *dst,
+; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width);
 ; **
-cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf
+cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, 
height, hf
 %assign %%stride ((%3 + 7)/8)
 MC_4TAP_FILTER   %3, hf, m4, m5
 .loop:
@@ -602,10 +600,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, 
dststride, src, srcstride,
 RET
 
 ; **
-; void %1_put_4tap_v(int16_t *dst,
+; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width)
 ; **
-cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, 
r3src, vf
+cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, 
height, r3src, vf
 sub srcq, srcstrideq
 MC_4TAP_FILTER%3, vf, m4, m5
 lea   r3srcq, [srcstrideq*3]
@@ -639,10 +637,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, 
dststride, src, srcstride,
 
 %macro PUT_4TAP_HV 3
 ; **
-; void put_4tap_hv(int16_t *dst,
+; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width)
 ; **
-cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, 
r3src
+cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, 
height, hf, vf, r3src
 %assign %%stride ((%3 + 7)/8)
 sub srcq, srcstrideq
 MC_4TAP_HV_FILTER%3
@@ -774,12 +772,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, 
dststride, src, srcstride, heig
 %endmacro
 
 ; **
-; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ;   int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
 
 %macro PUT_8TAP 3
-cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf
+cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, 
hf
 MC_8TAP_FILTER  %3, hf
 .loop:
 MC_8TAP_H_LOAD  %3, srcq, %2, 10
@@ -814,10 +812,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, 
dststride, src, srcstride, heigh
 
 
 ; **
-; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ;  int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
-cglobal %1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf
+cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height, 

[FFmpeg-devel] [PATCH v2 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm

2024-01-19 Thread toqsxw
From: Wu Jianhua 

This enable that the asm optimization can be reused by VVC

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/Makefile |1 +
 libavcodec/x86/h26x/h2656_inter.asm | 1135 +++
 libavcodec/x86/h26x/h2656dsp.c  |   98 +++
 libavcodec/x86/h26x/h2656dsp.h  |  103 +++
 libavcodec/x86/hevc_mc.asm  |  462 +--
 libavcodec/x86/hevcdsp_init.c   |  108 ++-
 6 files changed, 1461 insertions(+), 446 deletions(-)
 create mode 100644 libavcodec/x86/h26x/h2656_inter.asm
 create mode 100644 libavcodec/x86/h26x/h2656dsp.c
 create mode 100644 libavcodec/x86/h26x/h2656dsp.h

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index d5fb30645a..8098cd840c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += 
x86/hevc_add_res.o\
   x86/hevc_deblock.o\
   x86/hevc_idct.o   \
   x86/hevc_mc.o \
+  x86/h26x/h2656_inter.o\
   x86/hevc_sao.o\
   x86/hevc_sao_10bit.o
 X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
new file mode 100644
index 0000000000..4316c8ae3d
--- /dev/null
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -0,0 +1,1135 @@
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; * Copyright (c) 2023-2024 Nuo Mi
+; * Copyright (c) 2023-2024 Wu Jianhua
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 64
+
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+cextern pw_8192
+%define scale_8 pw_512
+%define scale_10 pw_2048
+%define scale_12 pw_8192
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+max_pixels_12:  times 16 dw ((1 << 12)-1)
+cextern pb_0
+
+SECTION .text
+%macro SIMPLE_LOAD 4;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+movd  %4, [%3]   ; 
load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+movq  %4, [%3]   ; 
load data from source
+%elif notcpuflag(avx)
+movu  %4, [%3]   ; 
load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+movdqu   %4, [%3]
+%else
+movu  %4, [%3]
+%endif
+%endmacro
+
+%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b,
+vpbroadcastw   %3, [%2q + 0 * 2]  ; coeff 0, 1
+vpbroadcastw   %4, [%2q + 1 * 2]  ; coeff 2, 3
+%if %1 != 8
+pmovsxbw   %3, xmm%3
+pmovsxbw   %4, xmm%4
+%endif
+%endmacro
+
+%macro MC_4TAP_HV_FILTER 1
+vpbroadcastw  m12, [vfq + 0 * 2]  ; vf 0, 1
+vpbroadcastw  m13, [vfq + 1 * 2]  ; vf 2, 3
+vpbroadcastw  m14, [hfq + 0 * 2]  ; hf 0, 1
+vpbroadcastw  m15, [hfq + 1 * 2]  ; hf 2, 3
+
+pmovsxbw  m12, xm12
+pmovsxbw  m13, xm13
+%if %1 != 8
+pmovsxbw  m14, xm14
+pmovsxbw  m15, xm15
+%endif
+lea   r3srcq, [srcstrideq*3]
+%endmacro
+
+%macro MC_8TAP_SAVE_FILTER 5;offset, mm registers
+mova [rsp + %1 + 0*mmsize], %2
+mova [rsp + %1 + 1*mmsize], %3
+mova [rsp + %1 + 2*mmsize], %4
+mova [rsp + %1 + 3*mmsize], %5
+%endmacro
+
+%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset
+vpbroadcastw  m12, [%2q + 0 * 2]  ; coeff 0, 1
+vpbroadcastw  m13, [%2q + 1 * 2]  ; coeff 2, 3
+vpbroadcastw  m14, [%2q + 2 * 2]  ; coeff 4, 5
+vpbroadcastw  m15, [%2q + 3 * 2]  ; coeff 6, 7
+%if %0 == 3
+MC_8TAP_SAVE_FILTER%3, m12, m13, m14, m15
+%endif
+
+%if %1 != 8
+pmovsxbw  m12, xm12
+pmovsxbw 

[FFmpeg-devel] [PATCH v2 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template

2024-01-19 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/hevcdsp_template.c | 594 +++---
 1 file changed, 46 insertions(+), 548 deletions(-)

diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 0de14e9dcf..9b48bdf08e 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -26,6 +26,7 @@
 #include "bit_depth_template.c"
 #include "hevcdsp.h"
 #include "h26x/h2656_sao_template.c"
+#include "h26x/h2656_inter_template.c"
 
 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int 
height,
   GetBitContext *gb, int pcm_bit_depth)
@@ -299,37 +300,51 @@ IDCT_DC(32)
 

 //
 

-static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
-  const uint8_t *_src, ptrdiff_t 
_srcstride,
-  int height, intptr_t mx, intptr_t my, 
int width)
-{
-int x, y;
-const pixel *src= (const pixel *)_src;
-ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-
-for (y = 0; y < height; y++) {
-for (x = 0; x < width; x++)
-dst[x] = src[x] << (14 - BIT_DEPTH);
-src += srcstride;
-dst += MAX_PB_SIZE;
-}
-}
-
-static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, 
const uint8_t *_src, ptrdiff_t _srcstride,
-  int height, intptr_t mx, intptr_t 
my, int width)
-{
-int y;
-const pixel *src= (const pixel *)_src;
-ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-pixel *dst  = (pixel *)_dst;
-ptrdiff_t dststride = _dststride / sizeof(pixel);
-
-for (y = 0; y < height; y++) {
-memcpy(dst, src, width * sizeof(pixel));
-src += srcstride;
-dst += dststride;
-}
-}
+/* Lets DECL_HV_FILTER(pel) resolve to the qpel filter table. */
+#define ff_hevc_pel_filters ff_hevc_qpel_filters
+
+/*
+ * Declare hf and vf as the rows of ff_hevc_<f>_filters selected by mx - 1
+ * and my - 1 (mx and my must be in scope).
+ * NOTE(review): with mx or my equal to 0 this forms a pointer to row -1;
+ * confirm the callers or the table layout make that safe.
+ * NOTE(review): hf/vf are const uint8_t * here while the put_* helpers take
+ * const int8_t * -- confirm against the filter tables' element type.
+ */
+#define DECL_HV_FILTER(f) \
+    const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \
+    const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1];
+
+/*
+ * Define put_hevc_<f> with an (mx, my) signature that looks up the filters
+ * for family p and forwards to the shared put_<t> template.
+ */
+#define FW_PUT(p, f, t) \
+static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, int height, \
+                                  intptr_t mx, intptr_t my, int width) \
+{ \
+    DECL_HV_FILTER(p) \
+    FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width); \
+}
+
+/* As FW_PUT, for the variants that also take dst and dststride. */
+#define FW_PUT_UNI(p, f, t) \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \
+                                  ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width) \
+{ \
+    DECL_HV_FILTER(p) \
+    FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width); \
+}
+
+/* As FW_PUT_UNI, additionally passing denom, wx and ox through. */
+#define FW_PUT_UNI_W(p, f, t) \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \
+                                  ptrdiff_t srcstride,int height, int denom, int wx, int ox, \
+                                  intptr_t mx, intptr_t my, int width) \
+{ \
+    DECL_HV_FILTER(p) \
+    FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, vf, width); \
+}
+
+/* Instantiate the plain, uni and uni_w wrappers for one filter direction. */
+#define FW_PUT_FUNCS(f, t, dir) \
+    FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \
+    FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir) \
+    FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir)
+
+FW_PUT(pel, pel_pixels, pixels)
+FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels)
+FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels)
+
+FW_PUT_FUNCS(qpel, luma,   h )
+FW_PUT_FUNCS(qpel, luma,   v )
+FW_PUT_FUNCS(qpel, luma,   hv)
+FW_PUT_FUNCS(epel, chroma, h )
+FW_PUT_FUNCS(epel, chroma, v )
+FW_PUT_FUNCS(epel, chroma, hv)
 
 static void FUNC(put_hevc_pel_bi_pixels)(uint8_t 

[FFmpeg-devel] [PATCH 5/8] avcodec/vvcdec: reuse h26x/2656_inter.asm to enable x86 optimizations

2024-01-18 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/Makefile  |   1 +
 libavcodec/vvc/vvcdsp.c  |   4 +
 libavcodec/vvc/vvcdsp.h  |   2 +
 libavcodec/x86/vvc/Makefile  |   6 +
 libavcodec/x86/vvc/vvcdsp_init.c | 200 +++
 5 files changed, 213 insertions(+)
 create mode 100644 libavcodec/x86/vvc/Makefile
 create mode 100644 libavcodec/x86/vvc/vvcdsp_init.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index bb42095165..ce33631b60 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -65,6 +65,7 @@ OBJS = ac3_parser.o   
  \
 
 # subsystems
 include $(SRC_PATH)/libavcodec/vvc/Makefile
+include $(SRC_PATH)/libavcodec/x86/vvc/Makefile
 OBJS-$(CONFIG_AANDCTTABLES)+= aandcttab.o
 OBJS-$(CONFIG_AC3DSP)  += ac3dsp.o ac3.o ac3tab.o
 OBJS-$(CONFIG_ADTS_HEADER) += adts_header.o 
mpeg4audio_sample_rates.o
diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c
index c82ea7be30..c542be5258 100644
--- a/libavcodec/vvc/vvcdsp.c
+++ b/libavcodec/vvc/vvcdsp.c
@@ -138,4 +138,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
 VVC_DSP(8);
 break;
 }
+
+#if ARCH_X86
+ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
+#endif
 }
diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h
index b5a63c5833..6f59e73654 100644
--- a/libavcodec/vvc/vvcdsp.h
+++ b/libavcodec/vvc/vvcdsp.h
@@ -167,4 +167,6 @@ typedef struct VVCDSPContext {
 
 void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
 
+void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
+
 #endif /* AVCODEC_VVC_VVCDSP_H */
diff --git a/libavcodec/x86/vvc/Makefile b/libavcodec/x86/vvc/Makefile
new file mode 100644
index 0000000000..b4acc22501
--- /dev/null
+++ b/libavcodec/x86/vvc/Makefile
@@ -0,0 +1,6 @@
+clean::
+   $(RM) $(CLEANSUFFIXES:%=libavcodec/x86/vvc/%)
+
+OBJS-$(CONFIG_VVC_DECODER) += x86/vvc/vvcdsp_init.o
+X86ASM-OBJS-$(CONFIG_VVC_DECODER)  += x86/h26x/h2656dsp.o   \
+   
  x86/h26x/h2656_inter.o
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
new file mode 100644
index 0000000000..69bbd07c80
--- /dev/null
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -0,0 +1,200 @@
+/*
+ * VVC DSP init for x86
+ *
+ * Copyright (C) 2022-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vvc/vvcdec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvcdsp.h"
+#include "libavcodec/x86/h26x/h2656dsp.h"
+
+/*
+ * Emit a static wrapper ff_vvc_put_<name>_<depth>_<opt>() that forwards to
+ * ff_h2656_put_<name>_<depth>_<opt>(), inserting 2 * MAX_PB_SIZE as the
+ * destination-stride argument that the wrapper's own signature does not carry.
+ */
+#define FW_PUT(name, depth, opt) \
+static void ff_vvc_put_ ## name ## _ ## depth ## _ ## opt(int16_t *dst, \
+    const uint8_t *src, ptrdiff_t srcstride, int height, \
+    const int8_t *hf, const int8_t *vf, int width) \
+{ \
+    ff_h2656_put_ ## name ## _ ## depth ## _ ## opt(dst, 2 * MAX_PB_SIZE, src, srcstride, \
+                                                    height, hf, vf, width); \
+}
+
+/*
+ * Wrappers for widths 4..128. Each FW_PUT expands to a complete function
+ * definition, so no ';' follows it (a stray ';' at file scope is not valid
+ * ISO C), and the last line carries no continuation backslash.
+ */
+#define FW_PUT_TAP(fname, bitd, opt) \
+    FW_PUT(fname ## 4,   bitd, opt) \
+    FW_PUT(fname ## 8,   bitd, opt) \
+    FW_PUT(fname ## 16,  bitd, opt) \
+    FW_PUT(fname ## 32,  bitd, opt) \
+    FW_PUT(fname ## 64,  bitd, opt) \
+    FW_PUT(fname ## 128, bitd, opt)
+
+/* Same as FW_PUT_TAP plus the width-2 variant. */
+#define FW_PUT_4TAP(fname, bitd, opt) \
+    FW_PUT(fname ## 2, bitd, opt) \
+    FW_PUT_TAP(fname,  bitd, opt)
+
+#define FW_PUT_4TAP_SSE4(bitd) \
+    FW_PUT_4TAP(pixels,  bitd, sse4) \
+    FW_PUT_4TAP(4tap_h,  bitd, sse4) \
+    FW_PUT_4TAP(4tap_v,  bitd, sse4) \
+    FW_PUT_4TAP(4tap_hv, bitd, sse4)
+
+#define FW_PUT_8TAP_SSE4(bitd) \
+    FW_PUT_TAP(8tap_h,  bitd, sse4) \
+    FW_PUT_TAP(8tap_v,  bitd, sse4) \
+    FW_PUT_TAP(8tap_hv, bitd, sse4)
+
+#define FW_PUT_SSE4(bitd) \
+    FW_PUT_4TAP_SSE4(bitd) \
+    FW_PUT_8TAP_SSE4(bitd)
+
+/* Instantiate the SSE4 wrappers for 8-, 10- and 12-bit. */
+FW_PUT_SSE4( 8)
+FW_PUT_SSE4(10)
+FW_PUT_SSE4(12)
+
+#define 

[FFmpeg-devel] [PATCH 8/8] tests/checkasm/vvc_mc: add check_avg

2024-01-18 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/vvc_mc.c | 64 +
 1 file changed, 64 insertions(+)

diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 711280deec..8adb00573f 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -35,6 +35,7 @@
 static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0xffffffff };
 static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
 
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
 #define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
 #define EXTRA_BEFORE 3
 #define EXTRA_AFTER  4
@@ -261,10 +262,73 @@ static void check_put_vvc_chroma_uni(void)
 report("put_uni_chroma");
 }
 
+#define AVG_SRC_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE)
+#define AVG_DST_BUF_SIZE (MAX_PB_SIZE * MAX_PB_SIZE * 2)
+
+/*
+ * Check c.inter.avg and c.inter.w_avg for bit depths 8, 10 and 12 and for
+ * every power-of-two width and height from 2 up to MAX_CTU_SIZE: run the
+ * reference (call_ref) and the tested (call_new) function on identical
+ * inputs and compare the complete destination buffers. Only square block
+ * sizes are benchmarked.
+ */
+static void check_avg(void)
+{
+    LOCAL_ALIGNED_32(int16_t, src00, [AVG_SRC_BUF_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, src01, [AVG_SRC_BUF_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, src10, [AVG_SRC_BUF_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, src11, [AVG_SRC_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, dst0, [AVG_DST_BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, dst1, [AVG_DST_BUF_SIZE]);
+    VVCDSPContext c;
+
+    for (int bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
+        /* src00/src10 and src01/src11 are filled with identical data. */
+        randomize_avg_src((uint8_t*)src00, (uint8_t*)src10, AVG_SRC_BUF_SIZE * sizeof(int16_t));
+        randomize_avg_src((uint8_t*)src01, (uint8_t*)src11, AVG_SRC_BUF_SIZE * sizeof(int16_t));
+        ff_vvc_dsp_init(&c, bit_depth);
+        for (int h = 2; h <= MAX_CTU_SIZE; h *= 2) {
+            for (int w = 2; w <= MAX_CTU_SIZE; w *= 2) {
+                {
+                    declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *src0, const int16_t *src1, int width, int height);
+                    if (check_func(c.inter.avg, "avg_%d_%dx%d", bit_depth, w, h)) {
+                        memset(dst0, 0, AVG_DST_BUF_SIZE);
+                        memset(dst1, 0, AVG_DST_BUF_SIZE);
+                        call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h);
+                        call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h);
+                        /* Compare exactly the bytes that were allocated and cleared. */
+                        if (memcmp(dst0, dst1, AVG_DST_BUF_SIZE))
+                            fail();
+                        if (w == h)
+                            bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h);
+                    }
+                }
+                {
+                    declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *src0, const int16_t *src1, int width, int height,
+                        int denom, int w0, int w1, int o0, int o1);
+                    {
+                        /* Drawn before check_func(), so rnd() is consumed for every size. */
+                        const int denom = rnd() % 8;
+                        const int w0    = rnd() % 256 - 128;
+                        const int w1    = rnd() % 256 - 128;
+                        const int o0    = rnd() % 256 - 128;
+                        const int o1    = rnd() % 256 - 128;
+                        if (check_func(c.inter.w_avg, "w_avg_%d_%dx%d", bit_depth, w, h)) {
+                            memset(dst0, 0, AVG_DST_BUF_SIZE);
+                            memset(dst1, 0, AVG_DST_BUF_SIZE);
+
+                            call_ref(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1);
+                            call_new(dst1, MAX_CTU_SIZE * SIZEOF_PIXEL, src10, src11, w, h, denom, w0, w1, o0, o1);
+                            /* Compare exactly the bytes that were allocated and cleared. */
+                            if (memcmp(dst0, dst1, AVG_DST_BUF_SIZE))
+                                fail();
+                            if (w == h)
+                                bench_new(dst0, MAX_CTU_SIZE * SIZEOF_PIXEL, src00, src01, w, h, denom, w0, w1, o0, o1);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    report("avg");
+}
+
 void checkasm_check_vvc_mc(void)
 {
 check_put_vvc_luma();
 check_put_vvc_luma_uni();
 check_put_vvc_chroma();
 check_put_vvc_chroma_uni();
+check_avg();
 }
-- 
2.34.1

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 7/8] avcodec/x86/vvc: add avg and avg_w AVX2 optimizations

2024-01-18 Thread toqsxw
From: Wu Jianhua 

The avg/avg_w is based on dav1d.
See https://code.videolan.org/videolan/dav1d/-/blob/master/src/x86/mc_avx2.asm

vvc_avg_8_2x2_c: 71.6
vvc_avg_8_2x2_avx2: 26.8
vvc_avg_8_2x4_c: 140.8
vvc_avg_8_2x4_avx2: 34.6
vvc_avg_8_2x8_c: 410.3
vvc_avg_8_2x8_avx2: 41.3
vvc_avg_8_2x16_c: 769.3
vvc_avg_8_2x16_avx2: 60.3
vvc_avg_8_2x32_c: 1669.6
vvc_avg_8_2x32_avx2: 105.1
vvc_avg_8_2x64_c: 1978.3
vvc_avg_8_2x64_avx2: 425.8
vvc_avg_8_2x128_c: 6536.8
vvc_avg_8_2x128_avx2: 1315.1
vvc_avg_8_4x2_c: 155.6
vvc_avg_8_4x2_avx2: 26.1
vvc_avg_8_4x4_c: 250.3
vvc_avg_8_4x4_avx2: 31.3
vvc_avg_8_4x8_c: 831.8
vvc_avg_8_4x8_avx2: 41.3
vvc_avg_8_4x16_c: 1461.1
vvc_avg_8_4x16_avx2: 57.1
vvc_avg_8_4x32_c: 2821.6
vvc_avg_8_4x32_avx2: 105.1
vvc_avg_8_4x64_c: 3615.8
vvc_avg_8_4x64_avx2: 412.6
vvc_avg_8_4x128_c: 11962.6
vvc_avg_8_4x128_avx2: 1274.3
vvc_avg_8_8x2_c: 215.8
vvc_avg_8_8x2_avx2: 29.1
vvc_avg_8_8x4_c: 430.6
vvc_avg_8_8x4_avx2: 37.6
vvc_avg_8_8x8_c: 1463.3
vvc_avg_8_8x8_avx2: 51.8
vvc_avg_8_8x16_c: 2630.1
vvc_avg_8_8x16_avx2: 97.6
vvc_avg_8_8x32_c: 5813.8
vvc_avg_8_8x32_avx2: 196.6
vvc_avg_8_8x64_c: 6687.3
vvc_avg_8_8x64_avx2: 487.8
vvc_avg_8_8x128_c: 13178.6
vvc_avg_8_8x128_avx2: 1290.6
vvc_avg_8_16x2_c: 443.8
vvc_avg_8_16x2_avx2: 28.3
vvc_avg_8_16x4_c: 1253.3
vvc_avg_8_16x4_avx2: 32.1
vvc_avg_8_16x8_c: 2236.3
vvc_avg_8_16x8_avx2: 44.3
vvc_avg_8_16x16_c: 5127.8
vvc_avg_8_16x16_avx2: 63.3
vvc_avg_8_16x32_c: 6573.3
vvc_avg_8_16x32_avx2: 223.6
vvc_avg_8_16x64_c: 30311.8
vvc_avg_8_16x64_avx2: 437.8
vvc_avg_8_16x128_c: 25693.3
vvc_avg_8_16x128_avx2: 1266.8
vvc_avg_8_32x2_c: 954.6
vvc_avg_8_32x2_avx2: 32.1
vvc_avg_8_32x4_c: 2359.6
vvc_avg_8_32x4_avx2: 39.6
vvc_avg_8_32x8_c: 5703.6
vvc_avg_8_32x8_avx2: 57.1
vvc_avg_8_32x16_c: 9967.6
vvc_avg_8_32x16_avx2: 107.1
vvc_avg_8_32x32_c: 21327.6
vvc_avg_8_32x32_avx2: 272.6
vvc_avg_8_32x64_c: 39240.8
vvc_avg_8_32x64_avx2: 529.6
vvc_avg_8_32x128_c: 52580.8
vvc_avg_8_32x128_avx2: 1338.8
vvc_avg_8_64x2_c: 1647.3
vvc_avg_8_64x2_avx2: 38.8
vvc_avg_8_64x4_c: 5130.1
vvc_avg_8_64x4_avx2: 58.8
vvc_avg_8_64x8_c: 6529.3
vvc_avg_8_64x8_avx2: 88.3
vvc_avg_8_64x16_c: 19913.6
vvc_avg_8_64x16_avx2: 162.3
vvc_avg_8_64x32_c: 39360.8
vvc_avg_8_64x32_avx2: 295.8
vvc_avg_8_64x64_c: 49658.3
vvc_avg_8_64x64_avx2: 784.1
vvc_avg_8_64x128_c: 108513.1
vvc_avg_8_64x128_avx2: 1977.1
vvc_avg_8_128x2_c: 3226.1
vvc_avg_8_128x2_avx2: 61.1
vvc_avg_8_128x4_c: 10280.3
vvc_avg_8_128x4_avx2: 94.6
vvc_avg_8_128x8_c: 18079.3
vvc_avg_8_128x8_avx2: 155.3
vvc_avg_8_128x16_c: 45121.8
vvc_avg_8_128x16_avx2: 285.3
vvc_avg_8_128x32_c: 48651.8
vvc_avg_8_128x32_avx2: 581.6
vvc_avg_8_128x64_c: 165078.6
vvc_avg_8_128x64_avx2: 1942.8
vvc_avg_8_128x128_c: 339103.1
vvc_avg_8_128x128_avx2: 4332.6
vvc_avg_10_2x2_c: 144.3
vvc_avg_10_2x2_avx2: 26.8
vvc_avg_10_2x4_c: 142.6
vvc_avg_10_2x4_avx2: 45.3
vvc_avg_10_2x8_c: 478.1
vvc_avg_10_2x8_avx2: 38.1
vvc_avg_10_2x16_c: 518.3
vvc_avg_10_2x16_avx2: 58.1
vvc_avg_10_2x32_c: 2059.8
vvc_avg_10_2x32_avx2: 93.1
vvc_avg_10_2x64_c: 2383.8
vvc_avg_10_2x64_avx2: 714.8
vvc_avg_10_2x128_c: 4498.3
vvc_avg_10_2x128_avx2: 1466.3
vvc_avg_10_4x2_c: 228.6
vvc_avg_10_4x2_avx2: 26.8
vvc_avg_10_4x4_c: 378.3
vvc_avg_10_4x4_avx2: 30.6
vvc_avg_10_4x8_c: 866.8
vvc_avg_10_4x8_avx2: 44.6
vvc_avg_10_4x16_c: 1018.1
vvc_avg_10_4x16_avx2: 58.1
vvc_avg_10_4x32_c: 3590.8
vvc_avg_10_4x32_avx2: 128.8
vvc_avg_10_4x64_c: 4200.8
vvc_avg_10_4x64_avx2: 663.6
vvc_avg_10_4x128_c: 8450.8
vvc_avg_10_4x128_avx2: 1531.8
vvc_avg_10_8x2_c: 369.3
vvc_avg_10_8x2_avx2: 28.3
vvc_avg_10_8x4_c: 513.8
vvc_avg_10_8x4_avx2: 32.1
vvc_avg_10_8x8_c: 1720.3
vvc_avg_10_8x8_avx2: 49.1
vvc_avg_10_8x16_c: 1894.8
vvc_avg_10_8x16_avx2: 71.6
vvc_avg_10_8x32_c: 3931.3
vvc_avg_10_8x32_avx2: 148.1
vvc_avg_10_8x64_c: 7964.3
vvc_avg_10_8x64_avx2: 613.1
vvc_avg_10_8x128_c: 15540.1
vvc_avg_10_8x128_avx2: 1585.1
vvc_avg_10_16x2_c: 877.3
vvc_avg_10_16x2_avx2: 27.6
vvc_avg_10_16x4_c: 955.8
vvc_avg_10_16x4_avx2: 29.8
vvc_avg_10_16x8_c: 3419.6
vvc_avg_10_16x8_avx2: 62.6
vvc_avg_10_16x16_c: 3826.8
vvc_avg_10_16x16_avx2: 54.3
vvc_avg_10_16x32_c: 7655.3
vvc_avg_10_16x32_avx2: 86.3
vvc_avg_10_16x64_c: 30011.1
vvc_avg_10_16x64_avx2: 692.6
vvc_avg_10_16x128_c: 47894.8
vvc_avg_10_16x128_avx2: 1580.3
vvc_avg_10_32x2_c: 944.3
vvc_avg_10_32x2_avx2: 29.8
vvc_avg_10_32x4_c: 2022.6
vvc_avg_10_32x4_avx2: 35.1
vvc_avg_10_32x8_c: 6148.8
vvc_avg_10_32x8_avx2: 51.3
vvc_avg_10_32x16_c: 12601.6
vvc_avg_10_32x16_avx2: 70.8
vvc_avg_10_32x32_c: 15958.6
vvc_avg_10_32x32_avx2: 124.3
vvc_avg_10_32x64_c: 31784.6
vvc_avg_10_32x64_avx2: 757.3
vvc_avg_10_32x128_c: 63892.8
vvc_avg_10_32x128_avx2: 1711.3
vvc_avg_10_64x2_c: 1890.8
vvc_avg_10_64x2_avx2: 34.3
vvc_avg_10_64x4_c: 6267.3
vvc_avg_10_64x4_avx2: 42.6
vvc_avg_10_64x8_c: 12778.1
vvc_avg_10_64x8_avx2: 67.8
vvc_avg_10_64x16_c: 22304.3
vvc_avg_10_64x16_avx2: 116.8
vvc_avg_10_64x32_c: 30777.1
vvc_avg_10_64x32_avx2: 201.1
vvc_avg_10_64x64_c: 60169.1
vvc_avg_10_64x64_avx2: 1454.3
vvc_avg_10_64x128_c: 124392.8
vvc_avg_10_64x128_avx2: 3648.6

[FFmpeg-devel] [PATCH 6/8] tests/checkasm: add checkasm_check_vvc_mc

2024-01-18 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 tests/checkasm/Makefile   |   1 +
 tests/checkasm/checkasm.c |   3 +
 tests/checkasm/checkasm.h |   1 +
 tests/checkasm/vvc_mc.c   | 270 ++
 4 files changed, 275 insertions(+)
 create mode 100644 tests/checkasm/vvc_mc.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3b5b54352b..3562acb2b2 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -40,6 +40,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)  += v210dec.o
 AVCODECOBJS-$(CONFIG_V210_ENCODER)  += v210enc.o
 AVCODECOBJS-$(CONFIG_VORBIS_DECODER)+= vorbisdsp.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER)   += vp9dsp.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER)   += vvc_mc.o
 
 CHECKASMOBJS-$(CONFIG_AVCODEC)  += $(AVCODECOBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 87f24c77ca..36a97957e5 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -194,6 +194,9 @@ static const struct {
 #if CONFIG_VORBIS_DECODER
 { "vorbisdsp", checkasm_check_vorbisdsp },
 #endif
+#if CONFIG_VVC_DECODER
+{ "vvc_mc", checkasm_check_vvc_mc },
+#endif
 #endif
 #if CONFIG_AVFILTER
 #if CONFIG_AFIR_FILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 4db8c495ea..53cb3ccfbf 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -131,6 +131,7 @@ void checkasm_check_vp8dsp(void);
 void checkasm_check_vp9dsp(void);
 void checkasm_check_videodsp(void);
 void checkasm_check_vorbisdsp(void);
+void checkasm_check_vvc_mc(void);
 
 struct CheckasmPerf;
 
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
new file mode 100644
index 0000000000..711280deec
--- /dev/null
+++ b/tests/checkasm/vvc_mc.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2023-2024 Nuo Mi
+ * Copyright (c) 2023-2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "checkasm.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vvc/vvc_ctu.h"
+#include "libavcodec/vvc/vvc_data.h"
+
+#include "libavutil/common.h"
+#include "libavutil/internal.h"
+#include "libavutil/internal.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+static const uint32_t pixel_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff, 0x3fff3fff, 0xffffffff };
+static const int sizes[] = { 2, 4, 8, 16, 32, 64, 128 };
+
+#define PIXEL_STRIDE (MAX_CTU_SIZE * 2)
+#define EXTRA_BEFORE 3
+#define EXTRA_AFTER  4
+#define SRC_EXTRA    (EXTRA_BEFORE + EXTRA_AFTER) * 2
+#define SRC_BUF_SIZE (PIXEL_STRIDE + SRC_EXTRA) * (PIXEL_STRIDE + SRC_EXTRA)
+#define DST_BUF_SIZE (MAX_CTU_SIZE * MAX_CTU_SIZE * 2)
+#define SRC_OFFSET   ((PIXEL_STRIDE + EXTRA_BEFORE * 2) * EXTRA_BEFORE)
+
+#define randomize_buffers(buf0, buf1, size, mask)   \
+do {\
+int k;  \
+for (k = 0; k < size; k += 4) { \
+uint32_t r = rnd() & mask;  \
+AV_WN32A(buf0 + k, r);  \
+AV_WN32A(buf1 + k, r);  \
+}   \
+} while (0)
+
+#define randomize_pixels(buf0, buf1, size)  \
+do {\
+uint32_t mask = pixel_mask[(bit_depth - 8) >> 1];   \
+randomize_buffers(buf0, buf1, size, mask);  \
+} while (0)
+
+#define randomize_avg_src(buf0, buf1, size) \
+do {\
+uint32_t mask = 0x3fff3fff; \
+randomize_buffers(buf0, buf1, size, mask);  \
+} while (0)
+
+static void check_put_vvc_luma(void)
+{
+LOCAL_ALIGNED_32(int16_t, dst0, [DST_BUF_SIZE / 2]);
+LOCAL_ALIGNED_32(int16_t, dst1, [DST_BUF_SIZE / 2]);
+LOCAL_ALIGNED_32(uint8_t, src0, [SRC_BUF_SIZE]);
+LOCAL_ALIGNED_32(uint8_t, src1, [SRC_BUF_SIZE]);
+VVCDSPContext c;
+
+declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, int16_t 
*dst, const uint8_t *src, const ptrdiff_t src_stride,
+  

[FFmpeg-devel] [PATCH 3/8] avcodec/x86/hevc_mc: move put/put_uni to h26x/h2656_inter.asm

2024-01-18 Thread toqsxw
From: Wu Jianhua 

This enables the asm optimizations to be reused by VVC.

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/Makefile |1 +
 libavcodec/x86/h26x/h2656_inter.asm | 1135 +++
 libavcodec/x86/h26x/h2656dsp.c  |   98 +++
 libavcodec/x86/h26x/h2656dsp.h  |  105 +++
 libavcodec/x86/hevc_mc.asm  |  462 +--
 libavcodec/x86/hevcdsp_init.c   |  108 ++-
 6 files changed, 1463 insertions(+), 446 deletions(-)
 create mode 100644 libavcodec/x86/h26x/h2656_inter.asm
 create mode 100644 libavcodec/x86/h26x/h2656dsp.c
 create mode 100644 libavcodec/x86/h26x/h2656dsp.h

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index d5fb30645a..8098cd840c 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -167,6 +167,7 @@ X86ASM-OBJS-$(CONFIG_HEVC_DECODER) += 
x86/hevc_add_res.o\
   x86/hevc_deblock.o\
   x86/hevc_idct.o   \
   x86/hevc_mc.o \
+  x86/h26x/h2656_inter.o\
   x86/hevc_sao.o\
   x86/hevc_sao_10bit.o
 X86ASM-OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp.o
diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
new file mode 100644
index 0000000000..4316c8ae3d
--- /dev/null
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -0,0 +1,1135 @@
+; /*
+; * Provide SSE luma and chroma mc functions for HEVC/VVC decoding
+; * Copyright (c) 2013 Pierre-Edouard LEPERE
+; * Copyright (c) 2023-2024 Nuo Mi
+; * Copyright (c) 2023-2024 Wu Jianhua
+; *
+; * This file is part of FFmpeg.
+; *
+; * FFmpeg is free software; you can redistribute it and/or
+; * modify it under the terms of the GNU Lesser General Public
+; * License as published by the Free Software Foundation; either
+; * version 2.1 of the License, or (at your option) any later version.
+; *
+; * FFmpeg is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+; * Lesser General Public License for more details.
+; *
+; * You should have received a copy of the GNU Lesser General Public
+; * License along with FFmpeg; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
+; */
+%include "libavutil/x86/x86util.asm"
+
+%define MAX_PB_SIZE 64
+
+SECTION_RODATA 32
+cextern pw_255
+cextern pw_512
+cextern pw_2048
+cextern pw_1023
+cextern pw_1024
+cextern pw_4096
+cextern pw_8192
+%define scale_8 pw_512
+%define scale_10 pw_2048
+%define scale_12 pw_8192
+%define max_pixels_8 pw_255
+%define max_pixels_10 pw_1023
+max_pixels_12:  times 16 dw ((1 << 12)-1)
+cextern pb_0
+
+SECTION .text
+%macro SIMPLE_LOAD 4;width, bitd, tab, r1
+%if %1 == 2 || (%2 == 8 && %1 <= 4)
+movd  %4, [%3]   ; 
load data from source
+%elif %1 == 4 || (%2 == 8 && %1 <= 8)
+movq  %4, [%3]   ; 
load data from source
+%elif notcpuflag(avx)
+movu  %4, [%3]   ; 
load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+movdqu   %4, [%3]
+%else
+movu  %4, [%3]
+%endif
+%endmacro
+
+%macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b,
+vpbroadcastw   %3, [%2q + 0 * 2]  ; coeff 0, 1
+vpbroadcastw   %4, [%2q + 1 * 2]  ; coeff 2, 3
+%if %1 != 8
+pmovsxbw   %3, xmm%3
+pmovsxbw   %4, xmm%4
+%endif
+%endmacro
+
+%macro MC_4TAP_HV_FILTER 1
+vpbroadcastw  m12, [vfq + 0 * 2]  ; vf 0, 1
+vpbroadcastw  m13, [vfq + 1 * 2]  ; vf 2, 3
+vpbroadcastw  m14, [hfq + 0 * 2]  ; hf 0, 1
+vpbroadcastw  m15, [hfq + 1 * 2]  ; hf 2, 3
+
+pmovsxbw  m12, xm12
+pmovsxbw  m13, xm13
+%if %1 != 8
+pmovsxbw  m14, xm14
+pmovsxbw  m15, xm15
+%endif
+lea   r3srcq, [srcstrideq*3]
+%endmacro
+
+%macro MC_8TAP_SAVE_FILTER 5;offset, mm registers
+mova [rsp + %1 + 0*mmsize], %2
+mova [rsp + %1 + 1*mmsize], %3
+mova [rsp + %1 + 2*mmsize], %4
+mova [rsp + %1 + 3*mmsize], %5
+%endmacro
+
+%macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset
+vpbroadcastw  m12, [%2q + 0 * 2]  ; coeff 0, 1
+vpbroadcastw  m13, [%2q + 1 * 2]  ; coeff 2, 3
+vpbroadcastw  m14, [%2q + 2 * 2]  ; coeff 4, 5
+vpbroadcastw  m15, [%2q + 3 * 2]  ; coeff 6, 7
+%if %0 == 3
+MC_8TAP_SAVE_FILTER    %3, m12, m13, m14, m15
+%endif
+
+%if %1 != 8
+pmovsxbw  m12, xm12
+pmovsxbw 

[FFmpeg-devel] [PATCH 4/8] avcodec/x86/h26x/h2656_inter: add dststride to put

2024-01-18 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/x86/h26x/h2656_inter.asm | 32 ++---
 libavcodec/x86/h26x/h2656dsp.c  |  4 ++--
 libavcodec/x86/h26x/h2656dsp.h  |  2 +-
 libavcodec/x86/hevcdsp_init.c   |  2 +-
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
index 4316c8ae3d..68f88832a6 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -22,8 +22,6 @@
 ; */
 %include "libavutil/x86/x86util.asm"
 
-%define MAX_PB_SIZE 64
-
 SECTION_RODATA 32
 cextern pw_255
 cextern pw_512
@@ -332,7 +330,7 @@ SECTION .text
 %endmacro
 
 %macro LOOP_END 3
-add  %1q, 2*MAX_PB_SIZE  ; dst += dststride
+add  %1q, dststrideq ; dst += dststride
 add  %2q, %3q; src += srcstride
 dec  heightd ; cmp height
 jnz   .loop  ; height loop
@@ -529,7 +527,7 @@ SECTION .text
 
 
 ; **
-; void %1_put_pixels(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void %1_put_pixels(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ; int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
 
@@ -539,7 +537,7 @@ SECTION .text
 %endmacro
 
 %macro MC_PIXELS 3
-cglobal %1_put_pixels%2_%3, 4, 4, 3, dst, src, srcstride, height
+cglobal %1_put_pixels%2_%3, 5, 5, 3, dst, dststride, src, srcstride, height
 pxor  m2, m2
 .loop:
 SIMPLE_LOAD   %2, %3, srcq, m0
@@ -569,10 +567,10 @@ cglobal %1_put_uni_pixels%2_%3, 5, 5, 2, dst, dststride, 
src, srcstride, height
 %endif
 
 ; **
-; void %1_put_4tap_hX(int16_t *dst,
+; void %1_put_4tap_hX(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width);
 ; **
-cglobal %1_put_4tap_h%2_%3, 5, 5, XMM_REGS, dst, src, srcstride, height, hf
+cglobal %1_put_4tap_h%2_%3, 6, 6, XMM_REGS, dst, dststride, src, srcstride, 
height, hf
 %assign %%stride ((%3 + 7)/8)
 MC_4TAP_FILTER   %3, hf, m4, m5
 .loop:
@@ -602,10 +600,10 @@ cglobal %1_put_uni_4tap_h%2_%3, 6, 7, XMM_REGS, dst, 
dststride, src, srcstride,
 RET
 
 ; **
-; void %1_put_4tap_v(int16_t *dst,
+; void %1_put_4tap_v(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width)
 ; **
-cglobal %1_put_4tap_v%2_%3, 6, 6, XMM_REGS, dst, src, srcstride, height, 
r3src, vf
+cglobal %1_put_4tap_v%2_%3, 7, 7, XMM_REGS, dst, dststride, src, srcstride, 
height, r3src, vf
 sub srcq, srcstrideq
 MC_4TAP_FILTER%3, vf, m4, m5
 lea   r3srcq, [srcstrideq*3]
@@ -639,10 +637,10 @@ cglobal %1_put_uni_4tap_v%2_%3, 7, 7, XMM_REGS, dst, 
dststride, src, srcstride,
 
 %macro PUT_4TAP_HV 3
 ; **
-; void put_4tap_hv(int16_t *dst,
+; void put_4tap_hv(int16_t *dst, ptrdiff_t dststride,
 ;  const uint8_t *_src, ptrdiff_t _srcstride, int height, int8_t *hf, 
int8_t *vf, int width)
 ; **
-cglobal %1_put_4tap_hv%2_%3, 6, 7, 16 , dst, src, srcstride, height, hf, vf, 
r3src
+cglobal %1_put_4tap_hv%2_%3, 7, 8, 16 , dst, dststride, src, srcstride, 
height, hf, vf, r3src
 %assign %%stride ((%3 + 7)/8)
 sub srcq, srcstrideq
 MC_4TAP_HV_FILTER%3
@@ -774,12 +772,12 @@ cglobal %1_put_uni_4tap_hv%2_%3, 7, 8, 16 , dst, 
dststride, src, srcstride, heig
 %endmacro
 
 ; **
-; void put_8tap_hX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_hX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ;   int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
 
 %macro PUT_8TAP 3
-cglobal %1_put_8tap_h%2_%3, 5, 5, 16, dst, src, srcstride, height, hf
+cglobal %1_put_8tap_h%2_%3, 6, 6, 16, dst, dststride, src, srcstride, height, 
hf
 MC_8TAP_FILTER  %3, hf
 .loop:
 MC_8TAP_H_LOAD  %3, srcq, %2, 10
@@ -814,10 +812,10 @@ cglobal %1_put_uni_8tap_h%2_%3, 6, 7, 16 , dst, 
dststride, src, srcstride, heigh
 
 
 ; **
-; void put_8tap_vX_X_X(int16_t *dst, const uint8_t *_src, ptrdiff_t srcstride,
+; void put_8tap_vX_X_X(int16_t *dst, ptrdiff_t dststride, const uint8_t *_src, 
ptrdiff_t srcstride,
 ;  int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; **
-cglobal %1_put_8tap_v%2_%3, 6, 8, 16, dst, src, srcstride, height, r3src, vf
+cglobal %1_put_8tap_v%2_%3, 7, 8, 16, dst, dststride, src, srcstride, height, 

[FFmpeg-devel] [PATCH 2/8] avcodec/hevcdsp_template: reuse put/put_luma/put_chroma from h2656_inter_template

2024-01-18 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/hevcdsp_template.c | 594 +++---
 1 file changed, 46 insertions(+), 548 deletions(-)

diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 0de14e9dcf..9b48bdf08e 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -26,6 +26,7 @@
 #include "bit_depth_template.c"
 #include "hevcdsp.h"
 #include "h26x/h2656_sao_template.c"
+#include "h26x/h2656_inter_template.c"
 
 static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int 
height,
   GetBitContext *gb, int pcm_bit_depth)
@@ -299,37 +300,51 @@ IDCT_DC(32)
 

 //
 

-static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
-  const uint8_t *_src, ptrdiff_t 
_srcstride,
-  int height, intptr_t mx, intptr_t my, 
int width)
-{
-int x, y;
-const pixel *src= (const pixel *)_src;
-ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-
-for (y = 0; y < height; y++) {
-for (x = 0; x < width; x++)
-dst[x] = src[x] << (14 - BIT_DEPTH);
-src += srcstride;
-dst += MAX_PB_SIZE;
-}
-}
-
-static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, 
const uint8_t *_src, ptrdiff_t _srcstride,
-  int height, intptr_t mx, intptr_t 
my, int width)
-{
-int y;
-const pixel *src= (const pixel *)_src;
-ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-pixel *dst  = (pixel *)_dst;
-ptrdiff_t dststride = _dststride / sizeof(pixel);
-
-for (y = 0; y < height; y++) {
-memcpy(dst, src, width * sizeof(pixel));
-src += srcstride;
-dst += dststride;
-}
-}
+#define ff_hevc_pel_filters ff_hevc_qpel_filters
+#define DECL_HV_FILTER(f)  \
+const uint8_t *hf = ff_hevc_ ## f ## _filters[mx - 1]; \
+const uint8_t *vf = ff_hevc_ ## f ## _filters[my - 1];
+
+#define FW_PUT(p, f, t)
   \
+static void FUNC(put_hevc_## f)(int16_t *dst, const uint8_t *src, ptrdiff_t 
srcstride, int height,\
+  intptr_t mx, intptr_t my, int width) 
   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, src, srcstride, height, hf, vf, width);   
   \
+}
+
+#define FW_PUT_UNI(p, f, t)
   \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const 
uint8_t *src,   \
+  ptrdiff_t srcstride, int height, intptr_t 
mx, intptr_t my, int width)   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, dststride, src, srcstride, height, hf, vf, width);
   \
+}
+
+#define FW_PUT_UNI_W(p, f, t)  
   \
+static void FUNC(put_hevc_ ## f)(uint8_t *dst, ptrdiff_t dststride, const 
uint8_t *src,   \
+  ptrdiff_t srcstride,int height, int denom, 
int wx, int ox,  \
+  intptr_t mx, intptr_t my, int width) 
   \
+{  
   \
+DECL_HV_FILTER(p)  
   \
+FUNC(put_ ## t)(dst, dststride, src, srcstride, height, denom, wx, ox, hf, 
vf, width);\
+}
+
+#define FW_PUT_FUNCS(f, t, dir)   \
+FW_PUT(f, f ## _ ## dir, t ## _ ## dir) \
+FW_PUT_UNI(f, f ## _uni_ ## dir, uni_ ## t ## _ ## dir)\
+FW_PUT_UNI_W(f, f ## _uni_w_ ## dir, uni_## t ## _w_ ## dir)
+
+FW_PUT(pel, pel_pixels, pixels)
+FW_PUT_UNI(pel, pel_uni_pixels, uni_pixels)
+FW_PUT_UNI_W(pel, pel_uni_w_pixels, uni_w_pixels)
+
+FW_PUT_FUNCS(qpel, luma,   h )
+FW_PUT_FUNCS(qpel, luma,   v )
+FW_PUT_FUNCS(qpel, luma,   hv)
+FW_PUT_FUNCS(epel, chroma, h )
+FW_PUT_FUNCS(epel, chroma, v )
+FW_PUT_FUNCS(epel, chroma, hv)
 
 static void FUNC(put_hevc_pel_bi_pixels)(uint8_t 

[FFmpeg-devel] [PATCH 1/8] avcodec/vvc/vvc_inter_template: move put/put_luma/put_chroma template to h2656_inter_template.c

2024-01-18 Thread toqsxw
From: Wu Jianhua 

Signed-off-by: Wu Jianhua 
---
 libavcodec/h26x/h2656_inter_template.c | 577 +
 libavcodec/vvc/vvc_inter_template.c| 559 +---
 2 files changed, 578 insertions(+), 558 deletions(-)
 create mode 100644 libavcodec/h26x/h2656_inter_template.c

diff --git a/libavcodec/h26x/h2656_inter_template.c 
b/libavcodec/h26x/h2656_inter_template.c
new file mode 100644
index 0000000000..864f6c7e7d
--- /dev/null
+++ b/libavcodec/h26x/h2656_inter_template.c
@@ -0,0 +1,577 @@
+/*
+ * inter prediction template for HEVC/VVC
+ *
+ * Copyright (C) 2022 Nuo Mi
+ * Copyright (C) 2024 Wu Jianhua
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define CHROMA_EXTRA_BEFORE     1
+#define CHROMA_EXTRA            3
+#define LUMA_EXTRA_BEFORE       3
+#define LUMA_EXTRA              7
+
+static void FUNC(put_pixels)(int16_t *dst,
+const uint8_t *_src, const ptrdiff_t _src_stride,
+const int height, const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src= (const pixel *)_src;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++)
+dst[x] = src[x] << (14 - BIT_DEPTH);
+src += src_stride;
+dst += MAX_PB_SIZE;
+}
+}
+
+static void FUNC(put_uni_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+const uint8_t *_src, const ptrdiff_t _src_stride, const int height,
+ const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src= (const pixel *)_src;
+pixel *dst  = (pixel *)_dst;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
+
+for (int y = 0; y < height; y++) {
+memcpy(dst, src, width * sizeof(pixel));
+src += src_stride;
+dst += dst_stride;
+}
+}
+
+static void FUNC(put_uni_w_pixels)(uint8_t *_dst, const ptrdiff_t _dst_stride,
+const uint8_t *_src, const ptrdiff_t _src_stride, const int height,
+const int denom, const int wx, const int _ox,  const int8_t *hf, const 
int8_t *vf,
+const int width)
+{
+const pixel *src= (const pixel *)_src;
+pixel *dst  = (pixel *)_dst;
+const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
+const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
+const int shift = denom + 14 - BIT_DEPTH;
+#if BIT_DEPTH < 14
+const int offset= 1 << (shift - 1);
+#else
+const int offset= 0;
+#endif
+const int ox= _ox * (1 << (BIT_DEPTH - 8));
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++) {
+const int v = (src[x] << (14 - BIT_DEPTH));
+dst[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox);
+}
+src += src_stride;
+dst += dst_stride;
+}
+}
+
+#define LUMA_FILTER(src, stride)   
\
+(filter[0] * src[x - 3 * stride] + 
\
+ filter[1] * src[x - 2 * stride] + 
\
+ filter[2] * src[x - stride] + 
\
+ filter[3] * src[x ] + 
\
+ filter[4] * src[x + stride] + 
\
+ filter[5] * src[x + 2 * stride] + 
\
+ filter[6] * src[x + 3 * stride] + 
\
+ filter[7] * src[x + 4 * stride])
+
+static void FUNC(put_luma_h)(int16_t *dst, const uint8_t *_src, const 
ptrdiff_t _src_stride,
+const int height, const int8_t *hf, const int8_t *vf, const int width)
+{
+const pixel *src   = (const pixel*)_src;
+const ptrdiff_t src_stride = _src_stride / sizeof(pixel);
+const int8_t *filter   = hf;
+
+for (int y = 0; y < height; y++) {
+for (int x = 0; x < width; x++)
+dst[x] = LUMA_FILTER(src, 1) >> (BIT_DEPTH - 8);
+src += src_stride;
+dst += MAX_PB_SIZE;
+}
+}
+
+static void