[FFmpeg-cvslog] checkasm: collapse hevc pel tests
ffmpeg | branch: master | J. Dekker | Thu Aug 5 10:26:48 2021 +0200| [b492cacffd36ad4cb251ba1f13ac398318ee639a] | committer: Thilo Borgmann checkasm: collapse hevc pel tests Also add to `make fate-checkasm' target. Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=b492cacffd36ad4cb251ba1f13ac398318ee639a --- tests/checkasm/checkasm.c | 11 +-- tests/checkasm/checkasm.h | 11 +-- tests/checkasm/hevc_pel.c | 34 -- tests/fate/checkasm.mak | 1 + 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index be5c17cd2a..b1353f7cbe 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -116,16 +116,7 @@ static const struct { #if CONFIG_HEVC_DECODER { "hevc_add_res", checkasm_check_hevc_add_res }, { "hevc_idct", checkasm_check_hevc_idct }, -{ "hevc_qpel", checkasm_check_hevc_qpel }, -{ "hevc_qpel_uni", checkasm_check_hevc_qpel_uni }, -{ "hevc_qpel_uni_w", checkasm_check_hevc_qpel_uni_w }, -{ "hevc_qpel_bi", checkasm_check_hevc_qpel_bi }, -{ "hevc_qpel_bi_w", checkasm_check_hevc_qpel_bi_w }, -{ "hevc_epel", checkasm_check_hevc_epel }, -{ "hevc_epel_uni", checkasm_check_hevc_epel_uni }, -{ "hevc_epel_uni_w", checkasm_check_hevc_epel_uni_w }, -{ "hevc_epel_bi", checkasm_check_hevc_epel_bi }, -{ "hevc_epel_bi_w", checkasm_check_hevc_epel_bi_w }, +{ "hevc_pel", checkasm_check_hevc_pel }, { "hevc_sao", checkasm_check_hevc_sao }, #endif #if CONFIG_HUFFYUV_DECODER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index b747ed1986..68b0697d3e 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -61,16 +61,7 @@ void checkasm_check_h264pred(void); void checkasm_check_h264qpel(void); void checkasm_check_hevc_add_res(void); void checkasm_check_hevc_idct(void); -void checkasm_check_hevc_qpel(void); -void checkasm_check_hevc_qpel_uni(void); -void checkasm_check_hevc_qpel_uni_w(void); -void checkasm_check_hevc_qpel_bi(void); -void 
checkasm_check_hevc_qpel_bi_w(void); -void checkasm_check_hevc_epel(void); -void checkasm_check_hevc_epel_uni(void); -void checkasm_check_hevc_epel_uni_w(void); -void checkasm_check_hevc_epel_bi(void); -void checkasm_check_hevc_epel_bi_w(void); +void checkasm_check_hevc_pel(void); void checkasm_check_hevc_sao(void); void checkasm_check_huffyuvdsp(void); void checkasm_check_jpeg2000dsp(void); diff --git a/tests/checkasm/hevc_pel.c b/tests/checkasm/hevc_pel.c index 4d1545e467..ec24309081 100644 --- a/tests/checkasm/hevc_pel.c +++ b/tests/checkasm/hevc_pel.c @@ -65,7 +65,7 @@ static const int offsets[] = {0, 255, -1 }; #define src0 (buf0 + 2 * 4 * MAX_PB_SIZE) /* hevc qpel functions read data from negative src pointer offsets */ #define src1 (buf1 + 2 * 4 * MAX_PB_SIZE) -void checkasm_check_hevc_qpel(void) +static void checkasm_check_hevc_qpel(void) { LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]); @@ -109,7 +109,7 @@ void checkasm_check_hevc_qpel(void) report("qpel"); } -void checkasm_check_hevc_qpel_uni(void) +static void checkasm_check_hevc_qpel_uni(void) { LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]); @@ -150,7 +150,7 @@ void checkasm_check_hevc_qpel_uni(void) report("qpel_uni"); } -void checkasm_check_hevc_qpel_uni_w(void) +static void checkasm_check_hevc_qpel_uni_w(void) { LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]); @@ -198,7 +198,7 @@ void checkasm_check_hevc_qpel_uni_w(void) report("qpel_uni_w"); } -void checkasm_check_hevc_qpel_bi(void) +static void checkasm_check_hevc_qpel_bi(void) { LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]); @@ -242,7 +242,7 @@ void checkasm_check_hevc_qpel_bi(void) report("qpel_bi"); } -void checkasm_check_hevc_qpel_bi_w(void) +static void checkasm_check_hevc_qpel_bi_w(void) { LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); LOCAL_ALIGNED_32(uint8_t, buf1, 
[BUF_SIZE]); @@ -294,7 +294,7 @@ void checkasm_check_hevc_qpel_bi_w(void) report("qpel_bi_w"); } -void checkasm_check_hevc_epel(void) +static void checkasm_check_hevc_epel(void) { LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]); @@ -338,7 +338,7 @@ void checkasm_check_hevc_epel(void) report("epel"); } -void checkasm_check_hevc_epel_uni(void) +static void checkasm_check_hevc_epel_uni(void) { LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_
[FFmpeg-cvslog] lavu/checkasm: add (private) kperf timing for macOS
ffmpeg | branch: master | J. Dekker | Tue Jul 20 19:09:22 2021 +0200| [9a727235fd497c22f2370e48dd1443d1376953e7] | committer: J. Dekker lavu/checkasm: add (private) kperf timing for macOS Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=9a727235fd497c22f2370e48dd1443d1376953e7 --- configure | 2 + libavutil/Makefile| 1 + libavutil/macos_kperf.c | 144 ++ libavutil/macos_kperf.h | 22 +++ libavutil/timer.h | 15 - tests/checkasm/checkasm.c | 14 - tests/checkasm/checkasm.h | 7 ++- 7 files changed, 201 insertions(+), 4 deletions(-) diff --git a/configure b/configure index f69e3a6fe7..bb6b096414 100755 --- a/configure +++ b/configure @@ -488,6 +488,7 @@ Developer options (useful when working on FFmpeg itself): --ignore-tests=TESTS comma-separated list (without "fate-" prefix in the name) of tests whose result is ignored --enable-linux-perf enable Linux Performance Monitor API + --enable-macos-kperf enable macOS kperf (private) API --disable-large-testsdisable tests that use a large amount of memory --disable-ptx-compression don't compress CUDA PTX code even when possible @@ -1977,6 +1978,7 @@ CONFIG_LIST=" fontconfig large_tests linux_perf +macos_kperf memory_poisoning neon_clobber_test ossfuzz diff --git a/libavutil/Makefile b/libavutil/Makefile index 47efb718d2..18dc5f22d9 100644 --- a/libavutil/Makefile +++ b/libavutil/Makefile @@ -181,6 +181,7 @@ OBJS-$(CONFIG_D3D11VA) += hwcontext_d3d11va.o OBJS-$(CONFIG_DXVA2)+= hwcontext_dxva2.o OBJS-$(CONFIG_LIBDRM) += hwcontext_drm.o OBJS-$(CONFIG_LZO) += lzo.o +OBJS-$(CONFIG_MACOS_KPERF) += macos_kperf.o OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o OBJS-$(CONFIG_QSV) += hwcontext_qsv.o diff --git a/libavutil/macos_kperf.c b/libavutil/macos_kperf.c new file mode 100644 index 00..cb229130f5 --- /dev/null +++ b/libavutil/macos_kperf.c @@ -0,0 +1,144 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "macos_kperf.h" +#include +#include +#include + +#define KPERF_LIST \ +F(int, kpc_get_counting, void) \ +F(int, kpc_force_all_ctrs_set, int)\ +F(int, kpc_set_counting, uint32_t) \ +F(int, kpc_set_thread_counting, uint32_t) \ +F(int, kpc_set_config, uint32_t, void *) \ +F(int, kpc_get_config, uint32_t, void *) \ +F(int, kpc_set_period, uint32_t, void *) \ +F(int, kpc_get_period, uint32_t, void *) \ +F(uint32_t, kpc_get_counter_count, uint32_t) \ +F(uint32_t, kpc_get_config_count, uint32_t)\ +F(int, kperf_sample_get, int *)\ +F(int, kpc_get_thread_counters, int, unsigned int, void *) + +#define F(ret, name, ...) 
\ +typedef ret name##proc(__VA_ARGS__); \ +static name##proc *name = NULL; +KPERF_LIST +#undef F + +#define CFGWORD_EL0A32EN_MASK (0x1) +#define CFGWORD_EL0A64EN_MASK (0x2) +#define CFGWORD_EL1EN_MASK(0x4) +#define CFGWORD_EL3EN_MASK(0x8) +#define CFGWORD_ALLMODES_MASK (0xf) + +#define CPMU_NONE 0 +#define CPMU_CORE_CYCLE 0x02 +#define CPMU_INST_A64 0x8c +#define CPMU_INST_BRANCH 0x8d +#define CPMU_SYNC_DC_LOAD_MISS 0xbf +#define CPMU_SYNC_DC_STORE_MISS 0xc0 +#define CPMU_SYNC_DTLB_MISS 0xc1 +#define CPMU_SYNC_ST_HIT_YNGR_LD 0xc4 +#define CPMU_SYNC_BR_ANY_MISP 0xcb +#define CPMU_FED_IC_MISS_DEM 0xd3 +#define CPMU_FED_ITLB_MISS 0xd4 + +#define KPC_CLASS_FIXED_MASK(1 << 0) +#define KPC_CLASS_CONFIGURABLE_MASK (1 << 1) +#define KPC_CLASS_POWER_MASK(1 << 2) +#define KPC_CLASS_RAWPMU_MASK (1 << 3) + +#define COUNTERS_COUNT 10 +#define CONFIG_CO
[FFmpeg-cvslog] lavu/kperf: use ff_thread_once()
ffmpeg | branch: master | J. Dekker | Wed Jul 21 16:21:34 2021 +0200| [c866a099b297203306165be3c444d481fcb22553] | committer: J. Dekker lavu/kperf: use ff_thread_once() Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c866a099b297203306165be3c444d481fcb22553 --- libavutil/macos_kperf.c | 65 +-- libavutil/macos_kperf.h | 2 +- libavutil/timer.h | 7 +++-- tests/checkasm/checkasm.c | 6 + 4 files changed, 22 insertions(+), 58 deletions(-) diff --git a/libavutil/macos_kperf.c b/libavutil/macos_kperf.c index cb229130f5..9fc04c6349 100644 --- a/libavutil/macos_kperf.c +++ b/libavutil/macos_kperf.c @@ -16,7 +16,10 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ +#include "avassert.h" #include "macos_kperf.h" +#include "thread.h" + #include #include #include @@ -68,69 +71,35 @@ KPERF_LIST #define CONFIG_COUNT 8 #define KPC_MASK (KPC_CLASS_CONFIGURABLE_MASK | KPC_CLASS_FIXED_MASK) -static int ff_kperf_was_init = 0; - -int ff_kperf_init() +static void kperf_init(void) { uint64_t config[COUNTERS_COUNT] = {0}; void *kperf = NULL; -if (ff_kperf_was_init) -return 0; +av_assert0(kperf = dlopen("/System/Library/PrivateFrameworks/kperf.framework/Versions/A/kperf", RTLD_LAZY)); -kperf = dlopen("/System/Library/PrivateFrameworks/kperf.framework/Versions/A/kperf", RTLD_LAZY); -if (!kperf) { -fprintf(stderr, "kperf: kperf = %p\n", kperf); -return -1; -} - -#define F(ret, name, ...)\ -name = (name##proc *)(dlsym(kperf, #name)); \ -if (!name) { \ -fprintf(stderr, "kperf: %s = %p\n", #name, (void *)name);\ -return -1; \ -} +#define F(ret, name, ...) 
av_assert0(name = (name##proc *)(dlsym(kperf, #name))); KPERF_LIST #undef F -if (kpc_get_counter_count(KPC_MASK) != COUNTERS_COUNT) { -fprintf(stderr, "kperf: wrong fixed counters count\n"); -return -1; -} - -if (kpc_get_config_count(KPC_MASK) != CONFIG_COUNT) { -fprintf(stderr, "kperf: wrong fixed config count\n"); -return -1; -} +av_assert0(kpc_get_counter_count(KPC_MASK) == COUNTERS_COUNT); +av_assert0(kpc_get_config_count(KPC_MASK) == CONFIG_COUNT); config[0] = CPMU_CORE_CYCLE | CFGWORD_EL0A64EN_MASK; // config[3] = CPMU_INST_BRANCH | CFGWORD_EL0A64EN_MASK; // config[4] = CPMU_SYNC_BR_ANY_MISP | CFGWORD_EL0A64EN_MASK; // config[5] = CPMU_INST_A64 | CFGWORD_EL0A64EN_MASK; -if (kpc_set_config(KPC_MASK, config)) { -fprintf(stderr, "kperf: kpc_set_config failed\n"); -return -1; -} - -if (kpc_force_all_ctrs_set(1)) { -fprintf(stderr, "kperf: kpc_force_all_ctrs_set failed\n"); -return -1; -} - -if (kpc_set_counting(KPC_MASK)) { -fprintf(stderr, "kperf: kpc_set_counting failed\n"); -return -1; -} - -if (kpc_set_thread_counting(KPC_MASK)) { -fprintf(stderr, "kperf: kpc_set_thread_counting failed\n"); -return -1; -} - -ff_kperf_was_init = 1; +av_assert0(kpc_set_config(KPC_MASK, config) == 0 || !"the kperf API needs to be run as root"); +av_assert0(kpc_force_all_ctrs_set(1) == 0); +av_assert0(kpc_set_counting(KPC_MASK) == 0); +av_assert0(kpc_set_thread_counting(KPC_MASK) == 0); +} -return 0; +void ff_kperf_init(void) +{ +static AVOnce init_static_once = AV_ONCE_INIT; +ff_thread_once(&init_static_once, kperf_init); } uint64_t ff_kperf_cycles() diff --git a/libavutil/macos_kperf.h b/libavutil/macos_kperf.h index 63b004214e..d039691340 100644 --- a/libavutil/macos_kperf.h +++ b/libavutil/macos_kperf.h @@ -21,7 +21,7 @@ #include -int ff_kperf_init(void); +void ff_kperf_init(void); uint64_t ff_kperf_cycles(void); #endif /* AVUTIL_MACOS_KPERF_H */ diff --git a/libavutil/timer.h b/libavutil/timer.h index 1cf384d772..71ea2f912e 100644 --- a/libavutil/timer.h +++ b/libavutil/timer.h 
@@ -131,12 +131,11 @@ #define START_TIMER \ uint64_t tperf; \ -if (ff_kperf_init())\ -av_log(NULL, AV_LOG_ERROR, "ff_kperf_init() failed\n"); \ -tperf = kperf_cycles(); +ff_kperf_init();\ +tperf = ff_kperf_cycles(); #define STOP_TIMER(id) \ -TIMER_REPORT(id, kperf_cycles() - tperf); +TIMER_REPORT(id, ff_kperf_cycles() - tperf); #elif defined(AV_READ_TIME) #define START_TIMER
[FFmpeg-cvslog] lavc/aarch64: add hevc sao edge 16x16
ffmpeg | branch: master | J. Dekker | Thu Oct 7 16:30:54 2021 +0200| [a9214a2ca31c9d54f893c5ac4004a5ff30a08d10] | committer: J. Dekker lavc/aarch64: add hevc sao edge 16x16 bench on AWS Graviton: hevc_sao_edge_16x16_8_c: 1857.0 hevc_sao_edge_16x16_8_neon: 211.0 hevc_sao_edge_32x32_8_c: 7802.2 hevc_sao_edge_32x32_8_neon: 808.2 hevc_sao_edge_48x48_8_c: 16764.2 hevc_sao_edge_48x48_8_neon: 1796.5 hevc_sao_edge_64x64_8_c: 32647.5 hevc_sao_edge_64x64_8_neon: 3118.5 Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a9214a2ca31c9d54f893c5ac4004a5ff30a08d10 --- libavcodec/aarch64/hevcdsp_init_aarch64.c | 8 +++- libavcodec/aarch64/hevcdsp_sao_neon.S | 65 +++ 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index c785e46f79..747ff0412d 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int16_t *sao_offset_val, int sao_left_class, int width, int height); - - +void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, + int16_t *sao_offset_val, int eo, int width, int height); av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) { @@ -76,6 +76,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon; +c->sao_edge_filter[1] = +c->sao_edge_filter[2] = +c->sao_edge_filter[3] = +c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_16x16_8_neon; } if (bit_depth == 10) { c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index f9fed8345b..4b895959d8 
100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -85,3 +85,68 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 bne 1b ret endfunc + +// ASSUMES STRIDE_SRC = 192 +.Lsao_edge_pos: +.word 1 // horizontal +.word 192 // vertical +.word 192 + 1 // 45 degree +.word 192 - 1 // 135 degree + +// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst, +// int16 *sao_offset_val, int eo, int width, int height) +function ff_hevc_sao_edge_filter_16x16_8_neon, export=1 +adr x7, .Lsao_edge_pos +ld1 {v3.8h}, [x3] // load sao_offset_val +sxtwx5, w5 +ldr w4, [x7, w4, uxtw #2] // stride_src +mov v3.h[7], v3.h[0] // reorder to [1,2,0,3,4] +mov v3.h[0], v3.h[1] +mov v3.h[1], v3.h[2] +mov v3.h[2], v3.h[7] +// split 16bit values into two tables +uzp2v1.16b, v3.16b, v3.16b // sao_offset_val -> upper +uzp1v0.16b, v3.16b, v3.16b // sao_offset_val -> lower +moviv2.16b, #2 +mov x15, #192 +// strides between end of line and next src/dst +sub x15, x15, x5 // stride_src - width +sub x16, x2, x5// stride_dst - width +mov x11, x1// copy base src +1: // new line +mov x14, x5// copy width +sub x12, x11, x4 // src_a (prev) = src - sao_edge_pos +add x13, x11, x4 // src_b (next) = src + sao_edge_pos +2: // process 16 bytes +ld1 {v3.16b}, [x11], #16 // load src +ld1 {v4.16b}, [x12], #16 // load src_a (prev) +ld1 {v5.16b}, [x13], #16 // load src_b (next) +cmhiv16.16b, v4.16b, v3.16b// (prev > cur) +cmhiv17.16b, v3.16b, v4.16b// (cur > prev) +cmhiv18.16b, v5.16b, v3.16b// (next > cur) +cmhiv19.16b, v3.16b, v5.16b// (cur > next) +sub v20.16b, v16.16b, v17.16b // diff0 = CMP(cur, prev) = (cur > prev) - (cur < prev) +sub v21.16b, v18.16b, v19.16b // diff1 = CMP(cur, next) = (cur > next) - (cur <
[FFmpeg-cvslog] lavc/arm: dont assign hevc_qpel functions for non-multiple of 8 widths
ffmpeg | branch: master | J. Dekker | Sat Oct 16 19:35:51 2021 +0200| [22b7c37275c611b5417722d8941844028aed7f25] | committer: J. Dekker lavc/arm: dont assign hevc_qpel functions for non-multiple of 8 widths The assembly is written assuming that the width is a multiple of 8. However the real issue is the functions were erroneously assigned to the 2, 4, 6 & 12 widths. This behaviour never broke the decoder as samples which trigger the functions for these widths have not been found in the wild. This relies on the mappings in ff_hevc_pel_weight[]. Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=22b7c37275c611b5417722d8941844028aed7f25 --- libavcodec/arm/hevcdsp_init_neon.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c index 201a088dac..112edb5edd 100644 --- a/libavcodec/arm/hevcdsp_init_neon.c +++ b/libavcodec/arm/hevcdsp_init_neon.c @@ -270,7 +270,8 @@ av_cold void ff_hevc_dsp_init_neon(HEVCDSPContext *c, const int bit_depth) put_hevc_qpel_uw_neon[3][1] = ff_hevc_put_qpel_uw_h1v3_neon_8; put_hevc_qpel_uw_neon[3][2] = ff_hevc_put_qpel_uw_h2v3_neon_8; put_hevc_qpel_uw_neon[3][3] = ff_hevc_put_qpel_uw_h3v3_neon_8; -for (x = 0; x < 10; x++) { +for (x = 3; x < 10; x++) { +if (x == 4) continue; c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper; c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper; c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper; ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavc/aarch64: add hevc sao edge 8x8
ffmpeg | branch: master | J. Dekker | Thu Oct 7 16:30:55 2021 +0200| [c97ffc1a77ccaf901e642bd21ed26aaf75557745] | committer: J. Dekker lavc/aarch64: add hevc sao edge 8x8 bench on AWS Graviton: hevc_sao_edge_8x8_8_c: 516.0 hevc_sao_edge_8x8_8_neon: 81.0 Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c97ffc1a77ccaf901e642bd21ed26aaf75557745 --- libavcodec/aarch64/hevcdsp_init_aarch64.c | 3 ++ libavcodec/aarch64/hevcdsp_sao_neon.S | 51 +++ 2 files changed, 54 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 747ff0412d..b93cec9e44 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -59,6 +59,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src, int width, int height); void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, int16_t *sao_offset_val, int eo, int width, int height); +void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, + int16_t *sao_offset_val, int eo, int width, int height); av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) { @@ -76,6 +78,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon; +c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8x8_8_neon; c->sao_edge_filter[1] = c->sao_edge_filter[2] = c->sao_edge_filter[3] = diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index 4b895959d8..167b9676d8 100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -150,3 +150,54 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1 // no lines to filter ret endfunc + +// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char 
*src, ptrdiff stride_dst, +//int16 *sao_offset_val, int eo, int width, int height) +function ff_hevc_sao_edge_filter_8x8_8_neon, export=1 +adr x7, .Lsao_edge_pos +ldr w4, [x7, w4, uxtw #2] +ld1 {v3.8h}, [x3] +mov v3.h[7], v3.h[0] +mov v3.h[0], v3.h[1] +mov v3.h[1], v3.h[2] +mov v3.h[2], v3.h[7] +uzp2v1.16b, v3.16b, v3.16b +uzp1v0.16b, v3.16b, v3.16b +moviv2.16b, #2 +add x16, x0, x2 +lsl x2, x2, #1 +mov x15, #192 +mov x8, x1 +sub x9, x1, x4 +add x10, x1, x4 +mov x17, #4 +1: ld1 {v3.d}[0], [ x8], x15 +ld1 {v4.d}[0], [ x9], x15 +ld1 {v5.d}[0], [x10], x15 +ld1 {v3.d}[1], [ x8], x15 +ld1 {v4.d}[1], [ x9], x15 +ld1 {v5.d}[1], [x10], x15 +cmhiv16.16b, v4.16b, v3.16b +cmhiv17.16b, v3.16b, v4.16b +cmhiv18.16b, v5.16b, v3.16b +cmhiv19.16b, v3.16b, v5.16b +sub v20.16b, v16.16b, v17.16b +sub v21.16b, v18.16b, v19.16b +add v20.16b, v20.16b, v21.16b +add v20.16b, v20.16b, v2.16b +tbl v16.16b, {v0.16b}, v20.16b +tbl v17.16b, {v1.16b}, v20.16b +uxtlv20.8h, v3.8b +uxtl2 v21.8h, v3.16b +zip1v18.16b, v16.16b, v17.16b +zip2v19.16b, v16.16b, v17.16b +sqadd v20.8h, v18.8h, v20.8h +sqadd v21.8h, v19.8h, v21.8h +sqxtun v6.8b, v20.8h +sqxtun v7.8b, v21.8h +st1 {v6.8b}, [ x0], x2 +st1 {v7.8b}, [x16], x2 +subsx17, x17, #1 +b.ne1b +ret +endfunc ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] Revert "arm: hevc_qpel: Fix the assembly to work with non-multiple of 8 widths"
ffmpeg | branch: master | J. Dekker | Sat Oct 16 19:35:52 2021 +0200| [7fc6015de9a833868699e14880a2f3d0f187c9cc] | committer: J. Dekker Revert "arm: hevc_qpel: Fix the assembly to work with non-multiple of 8 widths" This reverts commit 2589060b92eeeb944c6e2b50e38412c0c5fabcf4 which was originally to fix the FATE test. The real cause of the test breakage was fixed in 22b7c37275c611b5417722d8941844028aed7f25. Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=7fc6015de9a833868699e14880a2f3d0f187c9cc --- libavcodec/arm/hevcdsp_qpel_neon.S | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S b/libavcodec/arm/hevcdsp_qpel_neon.S index f71bec05ed..caa6efa766 100644 --- a/libavcodec/arm/hevcdsp_qpel_neon.S +++ b/libavcodec/arm/hevcdsp_qpel_neon.S @@ -237,7 +237,7 @@ vld1.8{d23}, [r2], r3 bne 8b subs r5, #8 -ble 99f +beq 99f mov r4, r12 add r6, #16 mov r0, r6 @@ -280,7 +280,7 @@ vld1.8{d23}, [r2], r3 bne 8b subs r5, #8 -ble 99f +beq 99f mov r4, r12 add r6, #8 mov r0, r6 @@ -310,7 +310,7 @@ vld1.8{d23}, [r2], r3 bne 8b subs r5, #8 -ble 99f +beq 99f mov r4, r12 add r6, #8 mov r0, r6 @@ -377,7 +377,7 @@ endfunc vst1.16 {q7}, [r0], r1 bne 8b subs r5, #8 -ble 99f +beq 99f mov r4, r12 add r6, #16 mov r0, r6 @@ -417,7 +417,7 @@ endfunc vst1.8d0, [r0], r1 bne 8b subs r5, #8 -ble 99f +beq 99f mov r4, r12 add r6, #8 mov r0, r6 @@ -446,7 +446,7 @@ endfunc vst1.8 d0, [r0], r1 bne 8b subs r5, #8 -ble 99f +beq 99f mov r4, r12 add r6, #8 add r10, #16 @@ -533,7 +533,7 @@ endfunc \filterh q7 bne 8b subs r5, #8 -ble 99f +beq 99f mov r4, r12 add r6, #16 mov r0, r6 @@ -594,7 +594,7 @@ endfunc \filterh q7 bne 8b subs r5, #8 -ble 99f +beq 99f mov r4, r12 add r6, #8 mov r0, r6 @@ -641,7 +641,7 @@ endfunc \filterh q7 bne 8b subs r5, #8 -ble 99f +beq 99f mov r4, r12 add r6, #8 mov r0, r6 ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit 
link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavc/aarch64: clean-up sao band 8x8 function formatting
ffmpeg | branch: master | J. Dekker | Wed Dec 15 20:06:20 2021 +0100| [89a2ed4a8b72683d0a8dfcb8c16c7a97eb740d5b] | committer: J. Dekker lavc/aarch64: clean-up sao band 8x8 function formatting Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=89a2ed4a8b72683d0a8dfcb8c16c7a97eb740d5b --- libavcodec/aarch64/hevcdsp_sao_neon.S | 65 ++- 1 file changed, 25 insertions(+), 40 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index 167b9676d8..73b0b3b056 100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -30,24 +30,21 @@ // int width, int height) function ff_hevc_sao_band_filter_8x8_8_neon, export=1 sub sp, sp, #64 -stpxzr, xzr, [sp] -stpxzr, xzr, [sp, #16] -stpxzr, xzr, [sp, #32] -stpxzr, xzr, [sp, #48] +stp xzr, xzr, [sp] +stp xzr, xzr, [sp, #16] +stp xzr, xzr, [sp, #32] +stp xzr, xzr, [sp, #48] mov w8, #4 -0: -ldrsh x9, [x4, x8, lsl #1] // x9 = sao_offset_val[k+1] +0: ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1] subsw8, w8, #1 -addw10, w8, w5 // x10 = k + sao_left_class -andw10, w10, #0x1F +add w10, w8, w5 // k + sao_left_class +and w10, w10, #0x1F strhw9, [sp, x10, lsl #1] bne 0b -ld1{v16.16b-v19.16b}, [sp], #64 -movi v20.8h, #1 -1: // beginning of line -mov w8, w6 -2: -// Simple layout for accessing 16bit values +ld1 {v16.16b-v19.16b}, [sp], #64 +moviv20.8h, #1 +1: mov w8, w6// beginning of line +2: // Simple layout for accessing 16bit values // with 8bit LUT. 
// // 00 01 02 03 04 05 06 07 @@ -55,33 +52,21 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED| // +---> //i-0 i-1 i-2 i-3 -// dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); -ld1{v2.8b}, [x1] -// load src[x] -uxtlv0.8h, v2.8b -// >> shift -ushrv2.8h, v0.8h, #3 // BIT_DEPTH - 3 -// x2 (access lower short) -shl v1.8h, v2.8h, #1 // low (x2, accessing short) -// +1 access upper short -add v3.8h, v1.8h, v20.8h -// shift insert index to upper byte -sli v1.8h, v3.8h, #8 -// table -tbxv2.16b, {v16.16b-v19.16b}, v1.16b -// src[x] + table -add v1.8h, v0.8h, v2.8h -// clip + narrow -sqxtun v4.8b, v1.8h -// store -st1{v4.8b}, [x0] -// done 8 pixels -subsw8, w8, #8 +ld1 {v2.8b}, [x1] // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); +uxtlv0.8h, v2.8b // load src[x] +ushrv2.8h, v0.8h, #3 // >> BIT_DEPTH - 3 +shl v1.8h, v2.8h, #1 // low (x2, accessing short) +add v3.8h, v1.8h, v20.8h // +1 access upper short +sli v1.8h, v3.8h, #8 // shift insert index to upper byte +tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table +add v1.8h, v0.8h, v2.8h // src[x] + table +sqxtun v4.8b, v1.8h // clip + narrow +st1 {v4.8b}, [x0] // store +subsw8, w8, #8// done 8 pixels bne 2b -// finished line -subsw7, w7, #1 -add x0, x0, x2 // dst += stride_dst -add x1, x1, x3 // src += stride_src +subsw7, w7, #1// finished line, prep. new +add x0, x0, x2// dst += stride_dst +add x1, x1, x3// src += stride_src bne 1b ret endfunc ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavc/aarch64: add hevc sao band 8x8 tiling
ffmpeg | branch: master | J. Dekker | Wed Nov 17 05:56:13 2021 +0100| [f63f9be37c799ddc835af358034630d31fb7db02] | committer: J. Dekker lavc/aarch64: add hevc sao band 8x8 tiling bench on AWS Graviton: hevc_sao_band_8x8_8_c: 317.5 hevc_sao_band_8x8_8_neon: 97.5 hevc_sao_band_16x16_8_c: 1115.0 hevc_sao_band_16x16_8_neon: 322.7 hevc_sao_band_32x32_8_c: 4599.2 hevc_sao_band_32x32_8_neon: 1246.2 hevc_sao_band_48x48_8_c: 10021.7 hevc_sao_band_48x48_8_neon: 2740.5 hevc_sao_band_64x64_8_c: 17635.0 hevc_sao_band_64x64_8_neon: 4875.7 Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f63f9be37c799ddc835af358034630d31fb7db02 --- libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 +- libavcodec/aarch64/hevcdsp_sao_neon.S | 11 +++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index b93cec9e44..2002530266 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -77,7 +77,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon; c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; -c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon; +c->sao_band_filter[0] = +c->sao_band_filter[1] = +c->sao_band_filter[2] = +c->sao_band_filter[3] = +c->sao_band_filter[4] = ff_hevc_sao_band_filter_8x8_8_neon; c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8x8_8_neon; c->sao_edge_filter[1] = c->sao_edge_filter[2] = diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index 73b0b3b056..d524323fe8 100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -3,7 +3,7 @@ * * AArch64 NEON optimised SAO functions for HEVC decoding * - * Copyright (c) 2020 Josh Dekker + * Copyright (c) 2020-2021 J. Dekker * * This file is part of FFmpeg. 
* @@ -35,6 +35,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 stp xzr, xzr, [sp, #32] stp xzr, xzr, [sp, #48] mov w8, #4 +sxtwx6, w6 0: ldrsh x9, [x4, x8, lsl #1] // sao_offset_val[k+1] subsw8, w8, #1 add w10, w8, w5 // k + sao_left_class @@ -43,7 +44,9 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 bne 0b ld1 {v16.16b-v19.16b}, [sp], #64 moviv20.8h, #1 -1: mov w8, w6// beginning of line +sub x2, x2, x6// stride_dst - width +sub x3, x3, x6// stride_src - width +1: mov x8, x6// beginning of line 2: // Simple layout for accessing 16bit values // with 8bit LUT. // @@ -52,7 +55,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED| // +---> //i-0 i-1 i-2 i-3 -ld1 {v2.8b}, [x1] // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); +ld1 {v2.8b}, [x1], #8 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); uxtlv0.8h, v2.8b // load src[x] ushrv2.8h, v0.8h, #3 // >> BIT_DEPTH - 3 shl v1.8h, v2.8h, #1 // low (x2, accessing short) @@ -61,7 +64,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table add v1.8h, v0.8h, v2.8h // src[x] + table sqxtun v4.8b, v1.8h // clip + narrow -st1 {v4.8b}, [x0] // store +st1 {v4.8b}, [x0], #8 // store subsw8, w8, #8// done 8 pixels bne 2b subsw7, w7, #1// finished line, prep. new ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavc/aarch64: hevc_sao reschedule slightly
ffmpeg | branch: master | J. Dekker | Wed May 25 10:55:34 2022 +0200| [3c694967f862dc5e09921438c6cbd191944ac13c] | committer: J. Dekker lavc/aarch64: hevc_sao reschedule slightly Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3c694967f862dc5e09921438c6cbd191944ac13c --- libavcodec/aarch64/hevcdsp_sao_neon.S | 28 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index efd8112af4..d4decfde3b 100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -3,7 +3,7 @@ * * AArch64 NEON optimised SAO functions for HEVC decoding * - * Copyright (c) 2020 Josh Dekker + * Copyright (c) 2022 J. Dekker * * This file is part of FFmpeg. * @@ -24,6 +24,10 @@ #include "libavutil/aarch64/asm.S" +#define MAX_PB_SIZE 64 +#define AV_INPUT_BUFFER_PADDING_SIZE 64 +#define SAO_STRIDE (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) + // void sao_band_filter(uint8_t *_dst, uint8_t *_src, // ptrdiff_t stride_dst, ptrdiff_t stride_src, // int16_t *sao_offset_val, int sao_left_class, @@ -57,6 +61,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 // +---> //i-0 i-1 i-2 i-3 ld1 {v2.8b}, [x1], #8 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); +subsw8, w8, #8 uxtlv0.8h, v2.8b // load src[x] ushrv2.8h, v0.8h, #3 // >> BIT_DEPTH - 3 shl v1.8h, v2.8h, #1 // low (x2, accessing short) @@ -66,7 +71,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 add v1.8h, v0.8h, v2.8h // src[x] + table sqxtun v4.8b, v1.8h // clip + narrow st1 {v4.8b}, [x0], #8 // store -subsw8, w8, #8// done 8 pixels +// done 8 pixels bne 2b subsw7, w7, #1// finished line, prep. 
new add x0, x0, x2// dst += stride_dst @@ -75,12 +80,11 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 ret endfunc -// ASSUMES STRIDE_SRC = 192 .Lsao_edge_pos: .word 1 // horizontal -.word 192 // vertical -.word 192 + 1 // 45 degree -.word 192 - 1 // 135 degree +.word SAO_STRIDE // vertical +.word SAO_STRIDE + 1 // 45 degree +.word SAO_STRIDE - 1 // 135 degree // ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst, // int16 *sao_offset_val, int eo, int width, int height) @@ -98,7 +102,7 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1 uzp2v1.16b, v3.16b, v3.16b // sao_offset_val -> upper uzp1v0.16b, v3.16b, v3.16b // sao_offset_val -> lower moviv2.16b, #2 -mov x15, #192 +mov x15, #SAO_STRIDE // strides between end of line and next src/dst sub x15, x15, x5 // stride_src - width sub x16, x2, x5// stride_dst - width @@ -111,6 +115,7 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1 ld1 {v3.16b}, [x11], #16 // load src ld1 {v4.16b}, [x12], #16 // load src_a (prev) ld1 {v5.16b}, [x13], #16 // load src_b (next) +subsx14, x14, #16 cmhiv16.16b, v4.16b, v3.16b// (prev > cur) cmhiv17.16b, v3.16b, v4.16b// (cur > prev) cmhiv18.16b, v5.16b, v3.16b// (next > cur) @@ -130,12 +135,12 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1 sqxtun v3.8b, v20.8h sqxtun2 v3.16b, v21.8h st1 {v3.16b}, [x0], #16 -subsx14, x14, #16 // filtered 16 bytes +// filtered 16 bytes b.ne2b // do we have width to filter? // no width to filter, setup next line +subsw6, w6, #1 // filtered line add x11, x11, x15 // stride src to next line add x0, x0, x16// stride dst to next line -subsw6, w6, #1 // filtered line b.ne1b // do we have lines to process? // no lines to filter ret @@ -156,17 +161,17 @@ function ff_hevc_sao_edge_filter_8x8_8_neon, export=1 moviv2.16b, #2 add x16, x0, x2
[FFmpeg-cvslog] lavc/aarch64: fix hevc sao band filter
ffmpeg | branch: master | J. Dekker | Tue Apr 26 09:29:54 2022 +0200| [d957ee34a6ec998ea00d6d07ac687c5d7a9792a2] | committer: J. Dekker lavc/aarch64: fix hevc sao band filter The SAO band filter can be called with non-multiples of 8, we round up to the nearest multiple of 8 to account for this. Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d957ee34a6ec998ea00d6d07ac687c5d7a9792a2 --- libavcodec/aarch64/hevcdsp_init_aarch64.c | 10 +- libavcodec/aarch64/hevcdsp_sao_neon.S | 8 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 1e40be740c..c8963e6104 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -75,11 +75,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon; c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; -// This function is disabled, as it doesn't handle widths that aren't -// an even multiple of 8 correctly. fate-hevc doesn't exercise that -// for the current size, but if enabled for bigger sizes, the cases -// of non-multiple of 8 seem to arise. 
-//c->sao_band_filter[0] = ff_hevc_sao_band_filter_8x8_8_neon; +c->sao_band_filter[0] = +c->sao_band_filter[1] = +c->sao_band_filter[2] = +c->sao_band_filter[3] = +c->sao_band_filter[4] = ff_hevc_sao_band_filter_8x8_8_neon; } if (bit_depth == 10) { c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index d523bf584d..e07e0cea2d 100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -41,7 +41,11 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 and w10, w10, #0x1F strhw9, [sp, x10, lsl #1] bne 0b +add w6, w6, #7 +bic w6, w6, #7 ld1 {v16.16b-v19.16b}, [sp], #64 +sub x2, x2, x6 +sub x3, x3, x6 moviv20.8h, #1 1: mov w8, w6// beginning of line 2: // Simple layout for accessing 16bit values @@ -52,7 +56,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED| // +---> //i-0 i-1 i-2 i-3 -ld1 {v2.8b}, [x1] // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); +ld1 {v2.8b}, [x1], #8 // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); uxtlv0.8h, v2.8b // load src[x] ushrv2.8h, v0.8h, #3 // >> BIT_DEPTH - 3 shl v1.8h, v2.8h, #1 // low (x2, accessing short) @@ -61,7 +65,7 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 tbx v2.16b, {v16.16b-v19.16b}, v1.16b // table add v1.8h, v0.8h, v2.8h // src[x] + table sqxtun v4.8b, v1.8h // clip + narrow -st1 {v4.8b}, [x0] // store +st1 {v4.8b}, [x0], #8 // store subsw8, w8, #8// done 8 pixels bne 2b subsw7, w7, #1// finished line, prep. new ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavc/aarch64: add hevc sao edge 8x8
ffmpeg | branch: master | J. Dekker | Thu Apr 28 14:57:43 2022 +0200| [2e832be322eb456e44b1e928904fa470a0b00a67] | committer: J. Dekker lavc/aarch64: add hevc sao edge 8x8 bench on AWS Graviton: hevc_sao_edge_8x8_8_c: 516.0 hevc_sao_edge_8x8_8_neon: 81.0 Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2e832be322eb456e44b1e928904fa470a0b00a67 --- libavcodec/aarch64/hevcdsp_init_aarch64.c | 3 ++ libavcodec/aarch64/hevcdsp_sao_neon.S | 51 +++ 2 files changed, 54 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index df521bb083..2002530266 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -59,6 +59,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src, int width, int height); void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, int16_t *sao_offset_val, int eo, int width, int height); +void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, + int16_t *sao_offset_val, int eo, int width, int height); av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) { @@ -80,6 +82,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->sao_band_filter[2] = c->sao_band_filter[3] = c->sao_band_filter[4] = ff_hevc_sao_band_filter_8x8_8_neon; +c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8x8_8_neon; c->sao_edge_filter[1] = c->sao_edge_filter[2] = c->sao_edge_filter[3] = diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index 0315c479df..efd8112af4 100644 --- a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -140,3 +140,54 @@ function ff_hevc_sao_edge_filter_16x16_8_neon, export=1 // no lines to filter ret endfunc + +// ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst, +//int16 
*sao_offset_val, int eo, int width, int height) +function ff_hevc_sao_edge_filter_8x8_8_neon, export=1 +adr x7, .Lsao_edge_pos +ldr w4, [x7, w4, uxtw #2] +ld1 {v3.8h}, [x3] +mov v3.h[7], v3.h[0] +mov v3.h[0], v3.h[1] +mov v3.h[1], v3.h[2] +mov v3.h[2], v3.h[7] +uzp2v1.16b, v3.16b, v3.16b +uzp1v0.16b, v3.16b, v3.16b +moviv2.16b, #2 +add x16, x0, x2 +lsl x2, x2, #1 +mov x15, #192 +mov x8, x1 +sub x9, x1, x4 +add x10, x1, x4 +lsr w17, w6, #1 +1: ld1 {v3.d}[0], [ x8], x15 +ld1 {v4.d}[0], [ x9], x15 +ld1 {v5.d}[0], [x10], x15 +ld1 {v3.d}[1], [ x8], x15 +ld1 {v4.d}[1], [ x9], x15 +ld1 {v5.d}[1], [x10], x15 +cmhiv16.16b, v4.16b, v3.16b +cmhiv17.16b, v3.16b, v4.16b +cmhiv18.16b, v5.16b, v3.16b +cmhiv19.16b, v3.16b, v5.16b +sub v20.16b, v16.16b, v17.16b +sub v21.16b, v18.16b, v19.16b +add v20.16b, v20.16b, v21.16b +add v20.16b, v20.16b, v2.16b +tbl v16.16b, {v0.16b}, v20.16b +tbl v17.16b, {v1.16b}, v20.16b +uxtlv20.8h, v3.8b +uxtl2 v21.8h, v3.16b +zip1v18.16b, v16.16b, v17.16b +zip2v19.16b, v16.16b, v17.16b +sqadd v20.8h, v18.8h, v20.8h +sqadd v21.8h, v19.8h, v21.8h +sqxtun v6.8b, v20.8h +sqxtun v7.8b, v21.8h +st1 {v6.8b}, [ x0], x2 +st1 {v7.8b}, [x16], x2 +subsx17, x17, #1 +b.ne1b +ret +endfunc ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavc/aarch64: add hevc sao edge 16x16
ffmpeg | branch: master | J. Dekker | Thu Apr 28 14:57:33 2022 +0200| [92f67e40170994dcb7a96ae362d95308f6744294] | committer: J. Dekker lavc/aarch64: add hevc sao edge 16x16 bench on AWS Graviton: hevc_sao_edge_16x16_8_c: 1857.0 hevc_sao_edge_16x16_8_neon: 211.0 hevc_sao_edge_32x32_8_c: 7802.2 hevc_sao_edge_32x32_8_neon: 808.2 hevc_sao_edge_48x48_8_c: 16764.2 hevc_sao_edge_48x48_8_neon: 1796.5 hevc_sao_edge_64x64_8_c: 32647.5 hevc_sao_edge_64x64_8_neon: 3118.5 Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=92f67e40170994dcb7a96ae362d95308f6744294 --- libavcodec/aarch64/hevcdsp_init_aarch64.c | 8 +++- libavcodec/aarch64/hevcdsp_sao_neon.S | 66 +++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index c8963e6104..df521bb083 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -57,8 +57,8 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int16_t *sao_offset_val, int sao_left_class, int width, int height); - - +void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, + int16_t *sao_offset_val, int eo, int width, int height); av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) { @@ -80,6 +80,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->sao_band_filter[2] = c->sao_band_filter[3] = c->sao_band_filter[4] = ff_hevc_sao_band_filter_8x8_8_neon; +c->sao_edge_filter[1] = +c->sao_edge_filter[2] = +c->sao_edge_filter[3] = +c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_16x16_8_neon; } if (bit_depth == 10) { c->add_residual[0] = ff_hevc_add_residual_4x4_10_neon; diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S index e07e0cea2d..0315c479df 100644 --- 
a/libavcodec/aarch64/hevcdsp_sao_neon.S +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S @@ -74,3 +74,69 @@ function ff_hevc_sao_band_filter_8x8_8_neon, export=1 bne 1b ret endfunc + +// ASSUMES STRIDE_SRC = 192 +.Lsao_edge_pos: +.word 1 // horizontal +.word 192 // vertical +.word 192 + 1 // 45 degree +.word 192 - 1 // 135 degree + +// ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst, +// int16 *sao_offset_val, int eo, int width, int height) +function ff_hevc_sao_edge_filter_16x16_8_neon, export=1 +adr x7, .Lsao_edge_pos +ld1 {v3.8h}, [x3] // load sao_offset_val +add w5, w5, #0xF +bic w5, w5, #0xF +ldr w4, [x7, w4, uxtw #2] // stride_src +mov v3.h[7], v3.h[0] // reorder to [1,2,0,3,4] +mov v3.h[0], v3.h[1] +mov v3.h[1], v3.h[2] +mov v3.h[2], v3.h[7] +// split 16bit values into two tables +uzp2v1.16b, v3.16b, v3.16b // sao_offset_val -> upper +uzp1v0.16b, v3.16b, v3.16b // sao_offset_val -> lower +moviv2.16b, #2 +mov x15, #192 +// strides between end of line and next src/dst +sub x15, x15, x5 // stride_src - width +sub x16, x2, x5// stride_dst - width +mov x11, x1// copy base src +1: // new line +mov x14, x5// copy width +sub x12, x11, x4 // src_a (prev) = src - sao_edge_pos +add x13, x11, x4 // src_b (next) = src + sao_edge_pos +2: // process 16 bytes +ld1 {v3.16b}, [x11], #16 // load src +ld1 {v4.16b}, [x12], #16 // load src_a (prev) +ld1 {v5.16b}, [x13], #16 // load src_b (next) +cmhiv16.16b, v4.16b, v3.16b// (prev > cur) +cmhiv17.16b, v3.16b, v4.16b// (cur > prev) +cmhiv18.16b, v5.16b, v3.16b// (next > cur) +cmhiv19.16b, v3.16b, v5.16b// (cur > next) +sub v20.16b, v16.16b, v17.16b // diff0 = CMP(cur, prev) = (cur > prev) - (cur < prev) +sub v21.16b, v18.16b, v19.16b // diff1 = CMP(cur, next) = (cur > next) - (cur < next) +
[FFmpeg-cvslog] checkasm: improve hevc_sao test
ffmpeg | branch: master | J. Dekker | Tue May 17 13:48:23 2022 +0200| [cc679054c715acda9438e566b8de3a9eba421ac3] | committer: J. Dekker checkasm: improve hevc_sao test The HEVC decoder can call these functions with smaller widths than the functions themselves are designed to operate on so we should only check the relevant output Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=cc679054c715acda9438e566b8de3a9eba421ac3 --- tests/checkasm/hevc_sao.c | 51 --- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/tests/checkasm/hevc_sao.c b/tests/checkasm/hevc_sao.c index 6b750758e2..4a23010243 100644 --- a/tests/checkasm/hevc_sao.c +++ b/tests/checkasm/hevc_sao.c @@ -78,20 +78,26 @@ static void check_sao_band(HEVCDSPContext h, int bit_depth) for (i = 0; i <= 4; i++) { int block_size = sao_size[i]; +int prev_size = i > 0 ? sao_size[i - 1] : 0; ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL; declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, int16_t *sao_offset_val, int sao_left_class, int width, int height); -randomize_buffers(src0, src1, BUF_SIZE); -randomize_buffers2(offset_val, OFFSET_LENGTH); -memset(dst0, 0, BUF_SIZE); -memset(dst1, 0, BUF_SIZE); - -if (check_func(h.sao_band_filter[i], "hevc_sao_band_%dx%d_%d", block_size, block_size, bit_depth)) { -call_ref(dst0, src0, stride, stride, offset_val, left_class, block_size, block_size); -call_new(dst1, src1, stride, stride, offset_val, left_class, block_size, block_size); -if (memcmp(dst0, dst1, BUF_SIZE)) -fail(); +if (check_func(h.sao_band_filter[i], "hevc_sao_band_%d_%d", block_size, bit_depth)) { + +for (int w = prev_size + 4; w <= block_size; w += 4) { +randomize_buffers(src0, src1, BUF_SIZE); +randomize_buffers2(offset_val, OFFSET_LENGTH); +memset(dst0, 0, BUF_SIZE); +memset(dst1, 0, BUF_SIZE); + +call_ref(dst0, src0, stride, stride, offset_val, left_class, w, block_size); +call_new(dst1, src1, stride, 
stride, offset_val, left_class, w, block_size); +for (int j = 0; j < block_size; j++) { +if (memcmp(dst0 + j*stride, dst1 + j*stride, w*SIZEOF_PIXEL)) +fail(); +} +} bench_new(dst1, src1, stride, stride, offset_val, left_class, block_size, block_size); } } @@ -109,21 +115,26 @@ static void check_sao_edge(HEVCDSPContext h, int bit_depth) for (i = 0; i <= 4; i++) { int block_size = sao_size[i]; +int prev_size = i > 0 ? sao_size[i - 1] : 0; ptrdiff_t stride = PIXEL_STRIDE*SIZEOF_PIXEL; int offset = (AV_INPUT_BUFFER_PADDING_SIZE + PIXEL_STRIDE)*SIZEOF_PIXEL; declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, uint8_t *src, ptrdiff_t stride_dst, int16_t *sao_offset_val, int eo, int width, int height); -randomize_buffers(src0, src1, BUF_SIZE); -randomize_buffers2(offset_val, OFFSET_LENGTH); -memset(dst0, 0, BUF_SIZE); -memset(dst1, 0, BUF_SIZE); - -if (check_func(h.sao_edge_filter[i], "hevc_sao_edge_%dx%d_%d", block_size, block_size, bit_depth)) { -call_ref(dst0, src0 + offset, stride, offset_val, eo, block_size, block_size); -call_new(dst1, src1 + offset, stride, offset_val, eo, block_size, block_size); -if (memcmp(dst0, dst1, BUF_SIZE)) -fail(); +for (int w = prev_size + 4; w <= block_size; w += 4) { +randomize_buffers(src0, src1, BUF_SIZE); +randomize_buffers2(offset_val, OFFSET_LENGTH); +memset(dst0, 0, BUF_SIZE); +memset(dst1, 0, BUF_SIZE); + +if (check_func(h.sao_edge_filter[i], "hevc_sao_edge_%d_%d", block_size, bit_depth)) { +call_ref(dst0, src0 + offset, stride, offset_val, eo, w, block_size); +call_new(dst1, src1 + offset, stride, offset_val, eo, w, block_size); +for (int j = 0; j < block_size; j++) { +if (memcmp(dst0 + j*stride, dst1 + j*stride, w*SIZEOF_PIXEL)) +fail(); +} +} bench_new(dst1, src1 + offset, stride, offset_val, eo, block_size, block_size); } } ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject 
"unsubscribe".
[FFmpeg-cvslog] lavc/aarch64: hevc_add_res add 12bit variants
ffmpeg | branch: master | J. Dekker | Tue Aug 16 07:01:53 2022 +0200| [ce2f47318bdd1586f538059ed36fbf61e825023d] | committer: J. Dekker lavc/aarch64: hevc_add_res add 12bit variants hevc_add_res_4x4_12_c: 46.0 hevc_add_res_4x4_12_neon: 18.7 hevc_add_res_8x8_12_c: 194.7 hevc_add_res_8x8_12_neon: 25.2 hevc_add_res_16x16_12_c: 716.0 hevc_add_res_16x16_12_neon: 69.7 hevc_add_res_32x32_12_c: 3820.7 hevc_add_res_32x32_12_neon: 261.0 Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ce2f47318bdd1586f538059ed36fbf61e825023d --- libavcodec/aarch64/hevcdsp_idct_neon.S| 158 +- libavcodec/aarch64/hevcdsp_init_aarch64.c | 14 +++ 2 files changed, 102 insertions(+), 70 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 484eea8437..124c50998a 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -5,7 +5,7 @@ * * Ported from arm/hevcdsp_idct_neon.S by * Copyright (c) 2020 Reimar Döffinger - * Copyright (c) 2020 Josh Dekker + * Copyright (c) 2020 J. Dekker * * This file is part of FFmpeg. 
* @@ -37,11 +37,11 @@ const trans, align=4 .short 31, 22, 13, 4 endconst -.macro clip10 in1, in2, c1, c2 -smax\in1, \in1, \c1 -smax\in2, \in2, \c1 -smin\in1, \in1, \c2 -smin\in2, \in2, \c2 +.macro clip2 in1, in2, min, max +smax\in1, \in1, \min +smax\in2, \in2, \min +smin\in1, \in1, \max +smin\in2, \in2, \max .endm function ff_hevc_add_residual_4x4_8_neon, export=1 @@ -64,25 +64,6 @@ function ff_hevc_add_residual_4x4_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_4x4_10_neon, export=1 -mov x12, x0 -ld1 {v0.8h-v1.8h}, [x1] -ld1 {v2.d}[0], [x12], x2 -ld1 {v2.d}[1], [x12], x2 -ld1 {v3.d}[0], [x12], x2 -sqadd v0.8h, v0.8h, v2.8h -ld1 {v3.d}[1], [x12], x2 -moviv4.8h, #0 -sqadd v1.8h, v1.8h, v3.8h -mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF -clip10 v0.8h, v1.8h, v4.8h, v5.8h -st1 {v0.d}[0], [x0], x2 -st1 {v0.d}[1], [x0], x2 -st1 {v1.d}[0], [x0], x2 -st1 {v1.d}[1], [x0], x2 -ret -endfunc - function ff_hevc_add_residual_8x8_8_neon, export=1 add x12, x0, x2 add x2, x2, x2 @@ -103,25 +84,6 @@ function ff_hevc_add_residual_8x8_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_8x8_10_neon, export=1 -add x12, x0, x2 -add x2, x2, x2 -mov x3, #8 -moviv4.8h, #0 -mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF -1: subsx3, x3, #2 -ld1 {v0.8h-v1.8h}, [x1], #32 -ld1 {v2.8h}, [x0] -sqadd v0.8h, v0.8h, v2.8h -ld1 {v3.8h}, [x12] -sqadd v1.8h, v1.8h, v3.8h -clip10 v0.8h, v1.8h, v4.8h, v5.8h -st1 {v0.8h}, [x0], x2 -st1 {v1.8h}, [x12], x2 -bne 1b -ret -endfunc - function ff_hevc_add_residual_16x16_8_neon, export=1 mov x3, #16 add x12, x0, x2 @@ -148,28 +110,6 @@ function ff_hevc_add_residual_16x16_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_16x16_10_neon, export=1 -mov x3, #16 -moviv20.8h, #0 -mvniv21.8h, #0xFC, lsl #8 // movi #0x3FF -add x12, x0, x2 -add x2, x2, x2 -1: subsx3, x3, #2 -ld1 {v16.8h-v17.8h}, [x0] -ld1 {v0.8h-v3.8h}, [x1], #64 -sqadd v0.8h, v0.8h, v16.8h -ld1 {v18.8h-v19.8h}, [x12] -sqadd v1.8h, v1.8h, v17.8h -sqadd v2.8h, v2.8h, v18.8h -sqadd v3.8h, 
v3.8h, v19.8h -clip10 v0.8h, v1.8h, v20.8h, v21.8h -clip10 v2.8h, v3.8h, v20.8h, v21.8h -st1 {v0.8h-v1.8h}, [x0], x2 -st1 {v2.8h-v3.8h}, [x12], x2 -bne 1b -ret -endfunc - function ff_hevc_add_residual_32x32_8_neon, export=1 add x12, x0, x2 add x2, x2, x2 @@ -209,10 +149,88 @@ function ff_hevc_add_residual_32x32_8_neon, export=1 ret endfunc -function ff_hevc_add_residual_32x32_10_neon, export=1 +.macro add_res bitdepth +function ff_hevc_add_residual_4x4_\bitdepth\()_n
[FFmpeg-cvslog] lavc/aarch64: add hevc horizontal qpel/uni/bi
ffmpeg | branch: master | J. Dekker | Tue Oct 11 09:09:02 2022 +0200| [9bed814e1d44e8374e9a4901e3f9b00ded0716fb] | committer: J. Dekker lavc/aarch64: add hevc horizontal qpel/uni/bi checkasm --benchmark on Ampere Altra (Neoverse N1): put_hevc_qpel_bi_h4_8_c: 170.7 put_hevc_qpel_bi_h4_8_neon: 64.5 put_hevc_qpel_bi_h6_8_c: 373.7 put_hevc_qpel_bi_h6_8_neon: 130.2 put_hevc_qpel_bi_h8_8_c: 662.0 put_hevc_qpel_bi_h8_8_neon: 138.5 put_hevc_qpel_bi_h12_8_c: 1529.5 put_hevc_qpel_bi_h12_8_neon: 422.0 put_hevc_qpel_bi_h16_8_c: 2735.5 put_hevc_qpel_bi_h16_8_neon: 560.5 put_hevc_qpel_bi_h24_8_c: 6015.7 put_hevc_qpel_bi_h24_8_neon: 1636.0 put_hevc_qpel_bi_h32_8_c: 10779.0 put_hevc_qpel_bi_h32_8_neon: 2204.5 put_hevc_qpel_bi_h48_8_c: 24375.0 put_hevc_qpel_bi_h48_8_neon: 4984.0 put_hevc_qpel_bi_h64_8_c: 42768.0 put_hevc_qpel_bi_h64_8_neon: 8795.7 put_hevc_qpel_h4_8_c: 149.0 put_hevc_qpel_h4_8_neon: 55.7 put_hevc_qpel_h6_8_c: 321.2 put_hevc_qpel_h6_8_neon: 106.0 put_hevc_qpel_h8_8_c: 578.7 put_hevc_qpel_h8_8_neon: 133.2 put_hevc_qpel_h12_8_c: 1279.0 put_hevc_qpel_h12_8_neon: 391.7 put_hevc_qpel_h16_8_c: 2286.2 put_hevc_qpel_h16_8_neon: 519.7 put_hevc_qpel_h24_8_c: 5100.7 put_hevc_qpel_h24_8_neon: 1546.2 put_hevc_qpel_h32_8_c: 9022.0 put_hevc_qpel_h32_8_neon: 2060.2 put_hevc_qpel_h48_8_c: 20293.5 put_hevc_qpel_h48_8_neon: 4656.7 put_hevc_qpel_h64_8_c: 36037.0 put_hevc_qpel_h64_8_neon: 8262.7 put_hevc_qpel_uni_h4_8_c: 162.2 put_hevc_qpel_uni_h4_8_neon: 61.7 put_hevc_qpel_uni_h6_8_c: 355.2 put_hevc_qpel_uni_h6_8_neon: 114.2 put_hevc_qpel_uni_h8_8_c: 651.0 put_hevc_qpel_uni_h8_8_neon: 135.7 put_hevc_qpel_uni_h12_8_c: 1412.5 put_hevc_qpel_uni_h12_8_neon: 402.7 put_hevc_qpel_uni_h16_8_c: 2551.0 put_hevc_qpel_uni_h16_8_neon: 533.5 put_hevc_qpel_uni_h24_8_c: 5782.2 put_hevc_qpel_uni_h24_8_neon: 1578.7 put_hevc_qpel_uni_h32_8_c: 10586.5 put_hevc_qpel_uni_h32_8_neon: 2102.2 put_hevc_qpel_uni_h48_8_c: 23812.0 put_hevc_qpel_uni_h48_8_neon: 4739.5 put_hevc_qpel_uni_h64_8_c: 42958.7 
put_hevc_qpel_uni_h64_8_neon: 8366.5 Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=9bed814e1d44e8374e9a4901e3f9b00ded0716fb --- libavcodec/aarch64/Makefile | 1 + libavcodec/aarch64/hevcdsp_init_aarch64.c | 67 + libavcodec/aarch64/hevcdsp_qpel_neon.S| 484 ++ 3 files changed, 552 insertions(+) diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 9ce21566c6..02fb51c3ab 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -67,4 +67,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ aarch64/vp9mc_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o \ aarch64/hevcdsp_init_aarch64.o \ + aarch64/hevcdsp_qpel_neon.o \ aarch64/hevcdsp_sao_neon.o diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 644cc17715..88a797f393 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -69,6 +69,46 @@ void ff_hevc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrd const int16_t *sao_offset_val, int eo, int width, int height); void ff_hevc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, const int16_t *sao_offset_val, int eo, int width, int height); +void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, + intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_qpel_h6_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, + intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_qpel_h8_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, + intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, + intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t 
_srcstride, int height, + intptr_t mx, intptr_t my, int width); +void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, + ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, + int width); +void ff_hevc_put_hevc_qpel_uni_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_
[FFmpeg-cvslog] checkasm/hevc_add_res: add 12bit test
ffmpeg | branch: master | J. Dekker | Thu Jun 23 20:04:05 2022 +0200| [ea6ecb12aa9ebfbc985f71938a6cccf5046ca826] | committer: J. Dekker checkasm/hevc_add_res: add 12bit test Also fix the bug where in every other byte only the lower 2 bits were used in the 8bit test. Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ea6ecb12aa9ebfbc985f71938a6cccf5046ca826 --- tests/checkasm/hevc_add_res.c | 15 --- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/checkasm/hevc_add_res.c b/tests/checkasm/hevc_add_res.c index 0c896adaca..f17d121939 100644 --- a/tests/checkasm/hevc_add_res.c +++ b/tests/checkasm/hevc_add_res.c @@ -36,14 +36,14 @@ } \ } while (0) -#define randomize_buffers2(buf, size) \ +#define randomize_buffers2(buf, size, mask) \ do { \ int j;\ for (j = 0; j < size; j++)\ -AV_WN16A(buf + j * 2, rnd() & 0x3FF); \ +AV_WN16A(buf + j * 2, rnd() & mask); \ } while (0) -static void compare_add_res(int size, ptrdiff_t stride, int overflow_test) +static void compare_add_res(int size, ptrdiff_t stride, int overflow_test, int mask) { LOCAL_ALIGNED_32(int16_t, res0, [32 * 32]); LOCAL_ALIGNED_32(int16_t, res1, [32 * 32]); @@ -53,7 +53,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int overflow_test) declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *res, ptrdiff_t stride); randomize_buffers(res0, size); -randomize_buffers2(dst0, size); +randomize_buffers2(dst0, size, mask); if (overflow_test) res0[0] = 0x8000; memcpy(res1, res0, sizeof(*res0) * size); @@ -69,6 +69,7 @@ static void compare_add_res(int size, ptrdiff_t stride, int overflow_test) static void check_add_res(HEVCDSPContext h, int bit_depth) { int i; +int mask = bit_depth == 8 ? 0xFFFF : bit_depth == 10 ? 
0x03FF : 0x07FF; for (i = 2; i <= 5; i++) { int block_size = 1 << i; @@ -76,9 +77,9 @@ static void check_add_res(HEVCDSPContext h, int bit_depth) ptrdiff_t stride = block_size << (bit_depth > 8); if (check_func(h.add_residual[i - 2], "hevc_add_res_%dx%d_%d", block_size, block_size, bit_depth)) { -compare_add_res(size, stride, 0); +compare_add_res(size, stride, 0, mask); // overflow test for res = -32768 -compare_add_res(size, stride, 1); +compare_add_res(size, stride, 1, mask); } } } @@ -87,7 +88,7 @@ void checkasm_check_hevc_add_res(void) { int bit_depth; -for (bit_depth = 8; bit_depth <= 10; bit_depth++) { +for (bit_depth = 8; bit_depth <= 12; bit_depth++) { HEVCDSPContext h; ff_hevc_dsp_init(&h, bit_depth); ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavc/aarch64: reformat add_res funcs
ffmpeg | branch: master | J. Dekker | Thu Jun 23 20:04:06 2022 +0200| [aa9eabb7a5283fd90b3274ac4b6ba0d16e42] | committer: J. Dekker lavc/aarch64: reformat add_res funcs Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=aa9eabb7a5283fd90b3274ac4b6ba0d16e42 --- libavcodec/aarch64/hevcdsp_idct_neon.S | 216 - 1 file changed, 108 insertions(+), 108 deletions(-) diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S index 0869431294..484eea8437 100644 --- a/libavcodec/aarch64/hevcdsp_idct_neon.S +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -27,21 +27,21 @@ #include "libavutil/aarch64/asm.S" const trans, align=4 -.short 64, 83, 64, 36 -.short 89, 75, 50, 18 -.short 90, 87, 80, 70 -.short 57, 43, 25, 9 -.short 90, 90, 88, 85 -.short 82, 78, 73, 67 -.short 61, 54, 46, 38 -.short 31, 22, 13, 4 +.short 64, 83, 64, 36 +.short 89, 75, 50, 18 +.short 90, 87, 80, 70 +.short 57, 43, 25, 9 +.short 90, 90, 88, 85 +.short 82, 78, 73, 67 +.short 61, 54, 46, 38 +.short 31, 22, 13, 4 endconst .macro clip10 in1, in2, c1, c2 -smax\in1, \in1, \c1 -smax\in2, \in2, \c1 -smin\in1, \in1, \c2 -smin\in2, \in2, \c2 +smax\in1, \in1, \c1 +smax\in2, \in2, \c1 +smin\in1, \in1, \c2 +smin\in2, \in2, \c2 .endm function ff_hevc_add_residual_4x4_8_neon, export=1 @@ -50,13 +50,13 @@ function ff_hevc_add_residual_4x4_8_neon, export=1 ld1 {v2.s}[1], [x0], x2 ld1 {v2.s}[2], [x0], x2 ld1 {v2.s}[3], [x0], x2 -sub x0, x0, x2, lsl #2 -uxtl v6.8h, v2.8b -uxtl2v7.8h, v2.16b -sqaddv0.8h, v0.8h, v6.8h -sqaddv1.8h, v1.8h, v7.8h -sqxtun v0.8b, v0.8h -sqxtun2 v0.16b, v1.8h +sub x0, x0, x2, lsl #2 +uxtlv6.8h, v2.8b +uxtl2 v7.8h, v2.16b +sqadd v0.8h, v0.8h, v6.8h +sqadd v1.8h, v1.8h, v7.8h +sqxtun v0.8b, v0.8h +sqxtun2 v0.16b, v1.8h st1 {v0.s}[0], [x0], x2 st1 {v0.s}[1], [x0], x2 st1 {v0.s}[2], [x0], x2 @@ -70,63 +70,63 @@ function ff_hevc_add_residual_4x4_10_neon, export=1 ld1 {v2.d}[0], [x12], x2 ld1 {v2.d}[1], [x12], x2 ld1 {v3.d}[0], [x12], x2 
-sqaddv0.8h, v0.8h, v2.8h +sqadd v0.8h, v0.8h, v2.8h ld1 {v3.d}[1], [x12], x2 -movi v4.8h, #0 -sqaddv1.8h, v1.8h, v3.8h -mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF -clip10 v0.8h, v1.8h, v4.8h, v5.8h -st1 {v0.d}[0], [x0], x2 -st1 {v0.d}[1], [x0], x2 -st1 {v1.d}[0], [x0], x2 -st1 {v1.d}[1], [x0], x2 +moviv4.8h, #0 +sqadd v1.8h, v1.8h, v3.8h +mvniv5.8h, #0xFC, lsl #8 // movi #0x3FF +clip10 v0.8h, v1.8h, v4.8h, v5.8h +st1 {v0.d}[0], [x0], x2 +st1 {v0.d}[1], [x0], x2 +st1 {v1.d}[0], [x0], x2 +st1 {v1.d}[1], [x0], x2 ret endfunc function ff_hevc_add_residual_8x8_8_neon, export=1 -add x12, x0, x2 -add x2, x2, x2 -mov x3, #8 -1: subs x3, x3, #2 -ld1 {v2.d}[0], [x0] -ld1 {v2.d}[1],[x12] -uxtl v3.8h, v2.8b +add x12, x0, x2 +add x2, x2, x2 +mov x3, #8 +1: subsx3, x3, #2 +ld1 {v2.d}[0], [x0] +ld1 {v2.d}[1], [x12] +uxtlv3.8h, v2.8b ld1 {v0.8h-v1.8h}, [x1], #32 -uxtl2v2.8h, v2.16b -sqaddv0.8h, v0.8h, v3.8h -sqaddv1.8h, v1.8h, v2.8h -sqxtun v0.8b, v0.8h -sqxtun2 v0.16b, v1.8h -st1 {v0.d}[0], [x0], x2 -st1 {v0.d}[1],[x12], x2 -bne 1b +uxtl2 v2.8h, v2.16b +sqadd v0.8h, v0.8h, v3.8h +sqadd v1.8h, v1.8h, v2.8h +sqxtun v0.8b, v0.8h +sqxtun2
[FFmpeg-cvslog] tools: add general_assembly.pl
ffmpeg | branch: master | J. Dekker | Wed Feb 16 02:49:29 2022 +0100| [926059dbf36c00807720a9160a43b4fa13f0d6ae] | committer: Anton Khirnov tools: add general_assembly.pl This script generates the current general assembly voters according to the criteria of '20 commits in the last 36 months'. Signed-off-by: J. Dekker Signed-off-by: Anton Khirnov > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=926059dbf36c00807720a9160a43b4fa13f0d6ae --- doc/dev_community/community.md | 3 +++ tools/general_assembly.pl | 40 2 files changed, 43 insertions(+) diff --git a/doc/dev_community/community.md b/doc/dev_community/community.md index 21e08e20e3..516ca5c05e 100644 --- a/doc/dev_community/community.md +++ b/doc/dev_community/community.md @@ -25,6 +25,9 @@ proposal by a member of the General Assembly. They are part of the GA for two years, after which they need a confirmation by the GA. +A script to generate the current members of the general assembly (minus members +voted in) can be found in `tools/general_assembly.pl`. + ## Voting Voting is done using a ranked voting system, currently running on https://vote.ffmpeg.org/ . diff --git a/tools/general_assembly.pl b/tools/general_assembly.pl new file mode 100644 index 00..898a6262ef --- /dev/null +++ b/tools/general_assembly.pl @@ -0,0 +1,40 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use POSIX qw(strftime); +use Encode qw(decode); +use Data::Dumper; + +sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; + +my @shortlog = split /\n/, decode('UTF-8', `git log --pretty=format:"%aN <%aE>" --since="last 36 months" | sort | uniq -c | sort -r`, Encode::FB_CROAK); +my %assembly = (); + +foreach my $line (@shortlog) { +my ($count, $name, $email) = $line =~ m/^ *(\d+) *(.*?) 
<(.*?)>/; +if ($count < 20) { +next; +} + +$name = trim $name; +if ($count < 50) { +my $true = 0; +my @commits = split /(^|\n)commit [a-z0-9]{40}(\n|$)/, decode('UTF-8', `git log --name-only --use-mailmap --author="$email" --since="last 36 months"`, Encode::FB_CROAK); +foreach my $commit (@commits) { +$true++; # if ($commit =~ /\n[\w\/]+\.(c|h|S|asm|texi)/); +} + +if ($true < 20) { +next; +} +} + +$assembly{$name} = $email; +} + +printf("# %s %s", strftime("%Y-%m-%d", localtime), decode('UTF-8', `git rev-parse HEAD`, Encode::FB_CROAK)); +foreach my $email (sort values %assembly) { +printf("%s\n", $email); +} ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] tests/checkasm: add exclude_guest for non-x86 linux perf
ffmpeg | branch: master | J. Dekker | Tue Apr 9 16:49:11 2024 +0200| [985fdf8e3d616633f1dc13920491bab45b1aa758] | committer: J. Dekker tests/checkasm: add exclude_guest for non-x86 linux perf The exclude_guest option only has an effect on x86. Omitting 'exclude_guest' defaults to zero which implies that you can count guest events should you run one. Some non-x86 kernels just ignore it, while others (e.g. the Asahi Linux kernels) require the user to explicitly set the option to 1, i.e. the only behaviour that makes sense when counting guest events isn't supported. Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=985fdf8e3d616633f1dc13920491bab45b1aa758 --- tests/checkasm/checkasm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index dcd2fd6957..8be6cb0f55 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -742,6 +742,9 @@ static int bench_init_linux(void) .disabled = 1, // start counting only on demand .exclude_kernel = 1, .exclude_hv = 1, +#if !ARCH_X86 +.exclude_guest = 1, +#endif }; printf("benchmarking with Linux Perf Monitoring API\n"); ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] checkasm/h264dsp: support checking more idct depths
ffmpeg | branch: master | J. Dekker | Wed Apr 24 16:09:44 2024 +0200| [4ced36744ee0bea5fb7a20d1b2926c588f89ea0b] | committer: J. Dekker checkasm/h264dsp: support checking more idct depths Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=4ced36744ee0bea5fb7a20d1b2926c588f89ea0b --- tests/checkasm/h264dsp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/h264dsp.c b/tests/checkasm/h264dsp.c index 0f484e3f43..30ac81e71e 100644 --- a/tests/checkasm/h264dsp.c +++ b/tests/checkasm/h264dsp.c @@ -173,6 +173,7 @@ static void dct8x8(int16_t *coef, int bit_depth) static void check_idct(void) { +static const int depths[5] = { 8, 9, 10, 12, 14 }; LOCAL_ALIGNED_16(uint8_t, src, [8 * 8 * 2]); LOCAL_ALIGNED_16(uint8_t, dst, [8 * 8 * 2]); LOCAL_ALIGNED_16(uint8_t, dst0, [8 * 8 * 2]); @@ -181,10 +182,11 @@ static void check_idct(void) LOCAL_ALIGNED_16(int16_t, subcoef0, [8 * 8 * 2]); LOCAL_ALIGNED_16(int16_t, subcoef1, [8 * 8 * 2]); H264DSPContext h; -int bit_depth, sz, align, dc; +int bit_depth, sz, align, dc, i; declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, int16_t *block, int stride); -for (bit_depth = 8; bit_depth <= 10; bit_depth++) { +for (i = 0; i < FF_ARRAY_ELEMS(depths); i++) { +bit_depth = depths[i]; ff_h264dsp_init(&h, bit_depth, 1); for (sz = 4; sz <= 8; sz += 4) { randomize_buffers(); ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] avfilter/riscv: build afir only if required
ffmpeg | branch: master | J. Dekker | Wed Apr 24 16:09:43 2024 +0200| [ca583b22e49b6523f2f8c83da3e2ed0ccaaecacf] | committer: J. Dekker avfilter/riscv: build afir only if required Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=ca583b22e49b6523f2f8c83da3e2ed0ccaaecacf --- libavfilter/riscv/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile index 0b968a9c0d..277dde2aed 100644 --- a/libavfilter/riscv/Makefile +++ b/libavfilter/riscv/Makefile @@ -1,2 +1,2 @@ -OBJS += riscv/af_afir_init.o -RVV-OBJS += riscv/af_afir_rvv.o +OBJS-$(CONFIG_AFIR_FILTER) += riscv/af_afir_init.o +RVV-OBJS-$(CONFIG_AFIR_FILTER) += riscv/af_afir_rvv.o ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] ffbuild/libversion.sh: add shebang
ffmpeg | branch: master | J. Dekker | Tue Apr 9 11:52:00 2024 +0200| [fcfd17dbb4a6cf270cdd82e91c21a5efdc878d12] | committer: J. Dekker ffbuild/libversion.sh: add shebang The implicit interpreter is dependent on the environment, and isn't guaranteed to be /bin/sh. Some packagers call this script directly, and in certain environments such as containers using qemu-user through binfmt_misc emulation on Linux it doesn't fallback to /bin/sh. To fix these cases we add the interpreter explicitly. Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=fcfd17dbb4a6cf270cdd82e91c21a5efdc878d12 --- ffbuild/libversion.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ffbuild/libversion.sh b/ffbuild/libversion.sh index a94ab58057..ecaa90cde6 100755 --- a/ffbuild/libversion.sh +++ b/ffbuild/libversion.sh @@ -1,3 +1,4 @@ +#!/bin/sh toupper(){ echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ } ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] configure, etc: switch to shebang without space
ffmpeg | branch: master | J. Dekker | Tue Apr 9 15:31:13 2024 +0200| [67e2f8b6bf5a44fa8083f90096de6131601879b4] | committer: J. Dekker configure, etc: switch to shebang without space Note that the config.sh file is left without a shebang, this file is supposed to be sourced into the current environment. This commit is purely cosmetic. Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=67e2f8b6bf5a44fa8083f90096de6131601879b4 --- configure | 2 +- doc/texidep.pl| 2 +- tests/fate-run.sh | 2 +- tests/fate.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configure b/configure index 7c22772485..55f1fc354d 100755 --- a/configure +++ b/configure @@ -4737,7 +4737,7 @@ chmod +x $TMPE # make sure we can execute files in $TMPDIR cat > $TMPSH 2>> $logfile <<EOF -#! /bin/sh +#!/bin/sh EOF chmod +x $TMPSH >> $logfile 2>&1 if ! $TMPSH >> $logfile 2>&1; then diff --git a/doc/texidep.pl b/doc/texidep.pl index 099690378e..33e6c7c53e 100644 --- a/doc/texidep.pl +++ b/doc/texidep.pl @@ -1,4 +1,4 @@ -#! /usr/bin/env perl +#!/usr/bin/env perl # This script will print the dependency of a Texinfo file to stdout. # texidep.pl diff --git a/tests/fate-run.sh b/tests/fate-run.sh index 9863e4f2d9..6ae0320c60 100755 --- a/tests/fate-run.sh +++ b/tests/fate-run.sh @@ -1,4 +1,4 @@ -#! /bin/sh +#!/bin/sh export LC_ALL=C diff --git a/tests/fate.sh b/tests/fate.sh index 07908be3a5..c5ee18de80 100755 --- a/tests/fate.sh +++ b/tests/fate.sh @@ -1,4 +1,4 @@ -#! /bin/sh +#!/bin/sh config=$1 ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] configure: simplify bigendian check
ffmpeg | branch: master | J. Dekker | Wed Mar 27 05:53:16 2024 +0100| [3090106635b32ee042fe3bb914b7c7863daa1d11] | committer: J. Dekker configure: simplify bigendian check The preferred way to use LTO is --enable-lto but often times packagers still end up with -flto in cflags for various reasons. Using grep on binary object files is brittle and relies on specific object representation, which in the case of LLVM bitcode, debug information or other intermediary formats can fail silently. This patch changes the check to a more commonly used define for GCC style compilers. More checks may be needed to cover other potential compilers that don't use the __BYTE_ORDER__ define. Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3090106635b32ee042fe3bb914b7c7863daa1d11 --- configure | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/configure b/configure index f511fbae49..7c22772485 100755 --- a/configure +++ b/configure @@ -6120,11 +6120,7 @@ extern_prefix=${sym%%ff_extern*} check_cc pragma_deprecated "" '_Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")' -# The global variable ensures the bits appear unchanged in the object file. -test_cc <https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] avcodec/aarch64/hevc: add luma deblock NEON
ffmpeg | branch: master | J. Dekker | Tue Feb 13 01:09:28 2024 +0100| [570052cd2a38200ae6aca52e817517513812ec56] | committer: J. Dekker avcodec/aarch64/hevc: add luma deblock NEON Benched using single-threaded full decode on an Ampere Altra. Bpp Before After Speedup 8 73,3s 65,2s 1.124x 10 114,2s 104,0s 1.098x 12 125,8s 115,7s 1.087x Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=570052cd2a38200ae6aca52e817517513812ec56 --- libavcodec/aarch64/hevcdsp_deblock_neon.S | 417 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 18 ++ 2 files changed, 435 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S index 8227f65649..581056a91e 100644 --- a/libavcodec/aarch64/hevcdsp_deblock_neon.S +++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S @@ -181,3 +181,420 @@ hevc_h_loop_filter_chroma 12 hevc_v_loop_filter_chroma 8 hevc_v_loop_filter_chroma 10 hevc_v_loop_filter_chroma 12 + +.macro hevc_loop_filter_luma_body bitdepth +function hevc_loop_filter_luma_body_\bitdepth\()_neon, export=0 +.if \bitdepth > 8 +lsl w2, w2, #(\bitdepth - 8) // beta <<= BIT_DEPTH - 8 +.else +uxtlv0.8h, v0.8b +uxtlv1.8h, v1.8b +uxtlv2.8h, v2.8b +uxtlv3.8h, v3.8b +uxtlv4.8h, v4.8b +uxtlv5.8h, v5.8b +uxtlv6.8h, v6.8b +uxtlv7.8h, v7.8b +.endif +ldr w7, [x3] // tc[0] +ldr w8, [x3, #4] // tc[1] +dup v18.4h, w7 +dup v19.4h, w8 +trn1v18.2d, v18.2d, v19.2d +.if \bitdepth > 8 +shl v18.8h, v18.8h, #(\bitdepth - 8) +.endif +dup v27.8h, w2 // beta +// tc25 +shl v19.8h, v18.8h, #2 // * 4 +add v19.8h, v19.8h, v18.8h // (tc * 5) +srshr v19.8h, v19.8h, #1 // (tc * 5 + 1) >> 1 +sshrv17.8h, v27.8h, #2 // beta2 + +// beta_2 check +// dp0 = abs(P2 - 2 * P1 + P0) +add v22.8h, v3.8h, v1.8h +shl v23.8h, v2.8h, #1 +sabdv30.8h, v22.8h, v23.8h +// dq0 = abs(Q2 - 2 * Q1 + Q0) +add v21.8h, v6.8h, v4.8h +shl v26.8h, v5.8h, #1 +sabdv31.8h, v21.8h, v26.8h +// d0 = dp0 + dq0 +add v20.8h, v30.8h, v31.8h +shl v25.8h, v20.8h, #1 +// (d0 << 1) < 
beta_2 +cmgtv23.8h, v17.8h, v25.8h + +// beta check +// d0 + d3 < beta +mov x9, #0x +dup v24.2d, x9 +and v25.16b, v24.16b, v20.16b +addpv25.8h, v25.8h, v25.8h // 1+0 0+1 1+0 0+1 +addpv25.4h, v25.4h, v25.4h // 1+0+0+1 1+0+0+1 +cmgtv25.4h, v27.4h, v25.4h // lower/upper mask in h[0/1] +mov w9, v25.s[0] +cmp w9, #0 +sxtlv26.4s, v25.4h +sxtlv16.2d, v26.2s // full skip mask +b.eq3f // skip both blocks + +// TODO: we can check the full skip mask with the weak/strong mask to +// potentially skip weak or strong calculation entirely if we only have one + +// beta_3 check +// abs(P3 - P0) + abs(Q3 - Q0) < beta_3 +sshrv17.8h, v17.8h, #1 // beta_3 +sabdv20.8h, v0.8h, v3.8h +sabav20.8h, v7.8h, v4.8h +cmgtv21.8h, v17.8h, v20.8h + +and v23.16b, v23.16b, v21.16b + +// tc25 check +// abs(P0 - Q0) < tc25 +sabdv20.8h, v3.8h, v4.8h +cmgtv21.8h, v19.8h, v20.8h + +and v23.16b, v23.16b, v21.16b + +// Generate low/high line max from lines 0/3/4/7 +// mask out lines 2/3/5/6 +not v20.16b, v24.16b // 0x +orr v23.16b, v23.16b, v20.16b + +// generate weak/strong mask +uminp v23.8h, v23.8h, v23.8h // extend to singles +sxtlv23.4s, v23.4h +uminp v26.4s, v23.4s, v23.4s // check lines +// extract to gpr +ext v25.16b, v26.16b, v26.16b, #2 +zip1v17.4s, v26.4s, v26.4s +mov w12, v25.s[0] +mov w11, #0x +mov w13, #0x +// -> strong strong +// -> strong weak +// -> weak strong +// -> weak weak +cmp w12, w13 +b.hi
[FFmpeg-cvslog] avdevice: deprecate opengl outdev
ffmpeg | branch: master | J. Dekker | Tue Feb 13 08:34:25 2024 +0100| [e4c0cdf8df96047ee195cc594a2a93443e2aa25d] | committer: J. Dekker avdevice: deprecate opengl outdev Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=e4c0cdf8df96047ee195cc594a2a93443e2aa25d --- doc/outdevs.texi| 2 +- libavdevice/opengl_enc.c| 11 +++ libavdevice/version_major.h | 2 ++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/outdevs.texi b/doc/outdevs.texi index f0484bbf8f..941429a8c8 100644 --- a/doc/outdevs.texi +++ b/doc/outdevs.texi @@ -302,7 +302,7 @@ ffmpeg -re -i INPUT -c:v rawvideo -pix_fmt bgra -f fbdev /dev/fb0 See also @url{http://linux-fbdev.sourceforge.net/}, and fbset(1). @section opengl -OpenGL output device. +OpenGL output device. Deprecated and will be removed. To enable this output device you need to configure FFmpeg with @code{--enable-opengl}. diff --git a/libavdevice/opengl_enc.c b/libavdevice/opengl_enc.c index b2ac6eb16a..69de6fad03 100644 --- a/libavdevice/opengl_enc.c +++ b/libavdevice/opengl_enc.c @@ -224,6 +224,8 @@ typedef struct OpenGLContext { int picture_height;///< Rendered height int window_width; int window_height; + +int warned; } OpenGLContext; static const struct OpenGLFormatDesc { @@ -1060,6 +1062,15 @@ static av_cold int opengl_write_header(AVFormatContext *h) AVStream *st; int ret; +if (!opengl->warned) { +av_log(opengl, AV_LOG_WARNING, +"The opengl output device is deprecated due to being fundamentally incompatible with libavformat API. 
" +"For monitoring purposes in ffmpeg you can output to a file or use pipes and a video player.\n" +"Example: ffmpeg -i INPUT -f nut -c:v rawvideo - | ffplay -\n" +); +opengl->warned = 1; +} + if (h->nb_streams != 1 || par->codec_type != AVMEDIA_TYPE_VIDEO || (par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME && par->codec_id != AV_CODEC_ID_RAWVIDEO)) { diff --git a/libavdevice/version_major.h b/libavdevice/version_major.h index 9f7b79b2ee..da5854ed4c 100644 --- a/libavdevice/version_major.h +++ b/libavdevice/version_major.h @@ -35,5 +35,7 @@ // reminder to remove the bktr device on next major bump #define FF_API_BKTR_DEVICE (LIBAVDEVICE_VERSION_MAJOR < 62) +// reminder to remove the opengl device on next major bump +#define FF_API_OPENGL_DEVICE (LIBAVDEVICE_VERSION_MAJOR < 62) #endif /* AVDEVICE_VERSION_MAJOR_H */ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] avdevice: deprecate sdl outdev
ffmpeg | branch: master | J. Dekker | Tue Feb 13 08:34:26 2024 +0100| [2b17a74df5fbbc87cdf7a0a784e2e088ab4afd3c] | committer: J. Dekker avdevice: deprecate sdl outdev Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2b17a74df5fbbc87cdf7a0a784e2e088ab4afd3c --- doc/outdevs.texi| 8 +++- libavdevice/sdl2.c | 10 ++ libavdevice/version_major.h | 2 ++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/outdevs.texi b/doc/outdevs.texi index 941429a8c8..9ee857528e 100644 --- a/doc/outdevs.texi +++ b/doc/outdevs.texi @@ -408,7 +408,13 @@ ffmpeg -i INPUT -f pulse "stream name" @section sdl -SDL (Simple DirectMedia Layer) output device. +SDL (Simple DirectMedia Layer) output device. Deprecated and will be removed. + +For monitoring purposes in FFmpeg, pipes and a video player such as ffplay can be used: + +@example +ffmpeg -i INPUT -f nut -c:v rawvideo - | ffplay - +@end example "sdl2" can be used as alias for "sdl". diff --git a/libavdevice/sdl2.c b/libavdevice/sdl2.c index 342a253dc0..ec3c3d19b5 100644 --- a/libavdevice/sdl2.c +++ b/libavdevice/sdl2.c @@ -51,6 +51,7 @@ typedef struct { SDL_Rect texture_rect; int inited; +int warned; } SDLContext; static const struct sdl_texture_format_entry { @@ -165,6 +166,15 @@ static int sdl2_write_header(AVFormatContext *s) int i, ret = 0; int flags = 0; +if (!sdl->warned) { +av_log(sdl, AV_LOG_WARNING, +"The sdl output device is deprecated due to being fundamentally incompatible with libavformat API. 
" +"For monitoring purposes in ffmpeg you can output to a file or use pipes and a video player.\n" +"Example: ffmpeg -i INPUT -f nut -c:v rawvideo - | ffplay -\n" +); +sdl->warned = 1; +} + if (!sdl->window_title) sdl->window_title = av_strdup(s->url); diff --git a/libavdevice/version_major.h b/libavdevice/version_major.h index da5854ed4c..6e04e0939d 100644 --- a/libavdevice/version_major.h +++ b/libavdevice/version_major.h @@ -37,5 +37,7 @@ #define FF_API_BKTR_DEVICE (LIBAVDEVICE_VERSION_MAJOR < 62) // reminder to remove the opengl device on next major bump #define FF_API_OPENGL_DEVICE (LIBAVDEVICE_VERSION_MAJOR < 62) +// reminder to remove the sdl2 device on next major bump +#define FF_API_SDL2_DEVICE (LIBAVDEVICE_VERSION_MAJOR < 62) #endif /* AVDEVICE_VERSION_MAJOR_H */ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] checkasm/hevc_deblock: add luma and chroma full
ffmpeg | branch: master | J. Dekker | Wed Jan 24 12:57:04 2024 +0100| [07cc8f6b3cd463b714aba1f0612c04d21bf8af16] | committer: J. Dekker checkasm/hevc_deblock: add luma and chroma full Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=07cc8f6b3cd463b714aba1f0612c04d21bf8af16 --- tests/checkasm/hevc_deblock.c | 246 -- 1 file changed, 215 insertions(+), 31 deletions(-) diff --git a/tests/checkasm/hevc_deblock.c b/tests/checkasm/hevc_deblock.c index 66fc8d5646..91e57f5cf5 100644 --- a/tests/checkasm/hevc_deblock.c +++ b/tests/checkasm/hevc_deblock.c @@ -19,9 +19,9 @@ #include #include "libavutil/intreadwrite.h" +#include "libavutil/macros.h" #include "libavutil/mem_internal.h" -#include "libavcodec/avcodec.h" #include "libavcodec/hevcdsp.h" #include "checkasm.h" @@ -29,10 +29,11 @@ static const uint32_t pixel_mask[3] = { 0x, 0x03ff03ff, 0x0fff0fff }; #define SIZEOF_PIXEL ((bit_depth + 7) / 8) -#define BUF_STRIDE (8 * 2) -#define BUF_LINES (8) -#define BUF_OFFSET (BUF_STRIDE * BUF_LINES) -#define BUF_SIZE (BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2) +#define BUF_STRIDE (16 * 2) +#define BUF_LINES (16) +// large buffer sizes based on high bit depth +#define BUF_OFFSET (2 * BUF_STRIDE * BUF_LINES) +#define BUF_SIZE (2 * BUF_STRIDE * BUF_LINES + BUF_OFFSET * 2) #define randomize_buffers(buf0, buf1, size) \ do {\ @@ -45,57 +46,240 @@ static const uint32_t pixel_mask[3] = { 0x, 0x03ff03ff, 0x0fff0fff }; } \ } while (0) -static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth) +static void check_deblock_chroma(HEVCDSPContext *h, int bit_depth, int c) { -int32_t tc[2] = { 0, 0 }; +// see tctable[] in hevc_filter.c, we check full range +int32_t tc[2] = { rnd() % 25, rnd() % 25 }; // no_p, no_q can only be { 0,0 } for the simpler assembly (non *_c // variant) functions, see deblocking_filter_CTB() in hevc_filter.c -uint8_t no_p[2] = { 0, 0 }; -uint8_t no_q[2] = { 0, 0 }; +uint8_t no_p[2] = { rnd() & c, rnd() & c }; +uint8_t no_q[2] = { 
rnd() & c, rnd() & c }; LOCAL_ALIGNED_32(uint8_t, buf0, [BUF_SIZE]); LOCAL_ALIGNED_32(uint8_t, buf1, [BUF_SIZE]); declare_func(void, uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q); -if (check_func(h->hevc_h_loop_filter_chroma, "hevc_h_loop_filter_chroma%d", bit_depth)) { -for (int i = 0; i < 4; i++) { -randomize_buffers(buf0, buf1, BUF_SIZE); -// see betatable[] in hevc_filter.c -tc[0] = (rnd() & 63) + (rnd() & 1); -tc[1] = (rnd() & 63) + (rnd() & 1); +if (check_func(c ? h->hevc_h_loop_filter_chroma_c : h->hevc_h_loop_filter_chroma, + "hevc_h_loop_filter_chroma%d%s", bit_depth, c ? "_full" : "")) +{ +randomize_buffers(buf0, buf1, BUF_SIZE); -call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); -call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); +call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); +call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); +if (memcmp(buf0, buf1, BUF_SIZE)) +fail(); +bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); +} + +if (check_func(c ? h->hevc_v_loop_filter_chroma_c : h->hevc_v_loop_filter_chroma, + "hevc_v_loop_filter_chroma%d%s", bit_depth, c ? "_full" : "")) +{ +randomize_buffers(buf0, buf1, BUF_SIZE); + +call_ref(buf0 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); +call_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); +if (memcmp(buf0, buf1, BUF_SIZE)) +fail(); +bench_new(buf1 + BUF_OFFSET, BUF_STRIDE, tc, no_p, no_q); +} +} + +#define P3 buf[-4 * xstride] +#define P2 buf[-3 * xstride] +#define P1 buf[-2 * xstride] +#define P0 buf[-1 * xstride] +#define Q0 buf[0 * xstride] +#define Q1 buf[1 * xstride] +#define Q2 buf[2 * xstride] +#define Q3 buf[3 * xstride] + +#define TC25(x) ((tc[x] * 5 + 1) >> 1) +#define MASK(x) (uint16_t)(x & ((1 << (bit_depth)) - 1)) +#define GET(x) ((SIZEOF_PIXEL == 1) ? 
*(uint8_t*)() : *(uint16_t*)()) +#define SET(x, y) do { \ +uint16_t z = MASK(y); \ +if (SIZEOF_PIXEL == 1) \ +*(uint8_t*)() = z; \ +else \ +*(uint16_t*)() = z; \ +} while (0) +#define RANDCLIP(x, diff) av_clip(GET(x) - (diff), 0, \ +(1 << (bit_depth)) - 1) + rnd() % FFMAX(2 * (diff), 1) + +// NOTE: this function doesn't work 'correctly' in that it won't always choose +// strong/strong or wea
[FFmpeg-cvslog] checkasm: add runs argument to adjust during bench
ffmpeg | branch: master | J. Dekker | Mon May 13 15:04:31 2024 +0200| [b1adf6d1d02c2be7418ab496486a350724740907] | committer: J. Dekker checkasm: add runs argument to adjust during bench Some timers on certain device and test combinations can produce noisy results, affecting the reliability of performance measurements. One notable example of this is the Canaan K230 RISC-V development board. An option to adjust the number of samples by an exponent (--runs) has been added, allowing developers to increase the sample count for more reliable results. Signed-off-by: J. Dekker > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=b1adf6d1d02c2be7418ab496486a350724740907 --- tests/checkasm/checkasm.c | 16 +++- tests/checkasm/checkasm.h | 7 --- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 31ca9f6e2b..669f2be9c1 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -72,6 +72,9 @@ void (*checkasm_checked_call)(void *func, int dummy, ...) 
= checkasm_checked_call_novfp; #endif +/* Trade-off between speed and accuracy */ +uint64_t bench_runs = 1U << 10; + /* List of tests to invoke */ static const struct { const char *name; @@ -820,7 +823,7 @@ static void bench_uninit(void) static int usage(const char *path) { fprintf(stderr, -"Usage: %s [--bench] [--test=] [--verbose] [seed]\n", +"Usage: %s [--bench] [--runs=] [--test=] [--verbose] [seed]\n", path); return 1; } @@ -867,6 +870,17 @@ int main(int argc, char *argv[]) state.test_name = arg + 7; } else if (!strcmp(arg, "--verbose") || !strcmp(arg, "-v")) { state.verbose = 1; +} else if (!strncmp(arg, "--runs=", 7)) { +l = strtoul(arg + 7, , 10); +if (*end == '\0') { +if (l > 30) { +fprintf(stderr, "checkasm: error: runs exponent must be within the range 0 <= 30\n"); +usage(argv[0]); +} +bench_runs = 1U << l; +} else { +return usage(argv[0]); +} } else if ((l = strtoul(arg, , 10)) <= UINT_MAX && *end == '\0') { seed = l; diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 07fcc751ff..e05053cbf6 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -167,7 +167,7 @@ extern AVLFG checkasm_lfg; static av_unused void *func_ref, *func_new; -#define BENCH_RUNS 1000 /* Trade-off between accuracy and speed */ +extern uint64_t bench_runs; /* Decide whether or not the specified function needs to be tested */ #define check_func(func, ...) 
(checkasm_save_context(), func_ref = checkasm_check_func((func_new = func), __VA_ARGS__)) @@ -336,10 +336,11 @@ typedef struct CheckasmPerf { av_unused const int sysfd = perf->sysfd;\ func_type *tfunc = func_new;\ uint64_t tsum = 0;\ -int ti, tcount = 0;\ +uint64_t ti, tcount = 0;\ uint64_t t = 0; \ +const uint64_t truns = bench_runs;\ checkasm_set_signal_handler_state(1);\ -for (ti = 0; ti < BENCH_RUNS; ti++) {\ +for (ti = 0; ti < truns; ti++) {\ PERF_START(t);\ tfunc(__VA_ARGS__);\ tfunc(__VA_ARGS__);\ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".