PR #21475 opened by hezuoqiang URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21475 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21475.patch
This optimization targets ff_nal_find_startcode in libavutil/nal, which directly returns pointers to complete startcode sequences (00 00 01) used by the H.264 demuxer, distinct from ff_startcode_find_candidate which only locates zero bytes and requires upper-layer validation. Technical Approach: - Uses NEON SIMD to detect "00" byte-pair patterns instead of single zeros - Processes data in 64-byte vector blocks for maximum throughput - Falls back to existing C implementation for small buffers or when NEON is unavailable Performance Analysis (22.88 MB 1080p H.264 video): - Single zero bytes in stream: 95,673 (98.1% false positive rate) - Valid NALU startcodes: 1,224 - Using "00" pattern reduces false positives from 98.1% to 22.8% - Only 22.8% of 64-byte blocks require detailed checking - 77.2% of blocks can be skipped entirely after NEON fast-path Benchmark Results (1000 iterations, Raspberry Pi 5 - Cortex-A76): - Baseline (ff_startcode_find_candidate + C validation): 5,454,680 μs - NEON optimized (ff_nal_find_startcode_neon): 1,741,280 μs - Speedup: 3.13x From c0ba98d30a1428ef12ef7dd9a4935ad6d3195703 Mon Sep 17 00:00:00 2001 From: Zuoqiang He <[email protected]> Date: Fri, 16 Jan 2026 00:31:58 +0800 Subject: [PATCH] libavutil/nal: optimize NAL startcode search with ARM NEON MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This optimization targets ff_nal_find_startcode in libavutil/nal, which directly returns pointers to complete startcode sequences (00 00 01) used by the H.264 demuxer, distinct from ff_startcode_find_candidate which only locates zero bytes and requires upper-layer validation. 
Technical Approach: - Uses NEON SIMD to detect "00" byte-pair patterns instead of single zeros - Processes data in 64-byte vector blocks for maximum throughput - Falls back to existing C implementation for small buffers or when NEON is unavailable Performance Analysis (22.88 MB 1080p H.264 video): - Single zero bytes in stream: 95,673 (98.1% false positive rate) - Valid NALU startcodes: 1,224 - Using "00" pattern reduces false positives from 98.1% to 22.8% - Only 22.8% of 64-byte blocks require detailed checking - 77.2% of blocks can be skipped entirely after NEON fast-path Benchmark Results (1000 iterations, Raspberry Pi 5 - Cortex-A76): - Baseline (ff_startcode_find_candidate + C validation): 5,454,680 μs - NEON optimized (ff_nal_find_startcode_neon): 1,741,280 μs - Speedup: 3.13x Signed-off-by: Zuoqiang He <[email protected]> --- libavformat/nal.c | 45 +-------- libavutil/Makefile | 1 + libavutil/aarch64/Makefile | 1 + libavutil/aarch64/nal.S | 169 +++++++++++++++++++++++++++++++ libavutil/nal.c | 100 +++++++++++++++++++ libavutil/nal.h | 60 +++++++++++ tests/checkasm/Makefile | 1 + tests/checkasm/checkasm.c | 1 + tests/checkasm/checkasm.h | 1 + tests/checkasm/nal.c | 197 +++++++++++++++++++++++++++++++++++++ tests/fate/checkasm.mak | 1 + 11 files changed, 536 insertions(+), 41 deletions(-) create mode 100644 libavutil/aarch64/nal.S create mode 100644 libavutil/nal.c create mode 100644 libavutil/nal.h create mode 100644 tests/checkasm/nal.c diff --git a/libavformat/nal.c b/libavformat/nal.c index 26dc5fe688..cabee37e0d 100644 --- a/libavformat/nal.c +++ b/libavformat/nal.c @@ -18,57 +18,20 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include <stdint.h> +#include "config.h" #include <string.h> #include "libavutil/mem.h" #include "libavutil/error.h" +#include "libavutil/nal.h" #include "libavcodec/defs.h" #include "avio.h" #include "avio_internal.h" #include "nal.h" -static const uint8_t *nal_find_startcode_internal(const 
uint8_t *p, const uint8_t *end) +const uint8_t *ff_nal_find_startcode(const uint8_t *p, const uint8_t *end) { - const uint8_t *a = p + 4 - ((intptr_t)p & 3); - - for (end -= 3; p < a && p < end; p++) { - if (p[0] == 0 && p[1] == 0 && p[2] == 1) - return p; - } - - for (end -= 3; p < end; p += 4) { - uint32_t x = *(const uint32_t*)p; -// if ((x - 0x01000100) & (~x) & 0x80008000) // little endian -// if ((x - 0x00010001) & (~x) & 0x00800080) // big endian - if ((x - 0x01010101) & (~x) & 0x80808080) { // generic - if (p[1] == 0) { - if (p[0] == 0 && p[2] == 1) - return p; - if (p[2] == 0 && p[3] == 1) - return p+1; - } - if (p[3] == 0) { - if (p[2] == 0 && p[4] == 1) - return p+2; - if (p[4] == 0 && p[5] == 1) - return p+3; - } - } - } - - for (end += 3; p < end; p++) { - if (p[0] == 0 && p[1] == 0 && p[2] == 1) - return p; - } - - return end + 3; -} - -const uint8_t *ff_nal_find_startcode(const uint8_t *p, const uint8_t *end){ - const uint8_t *out = nal_find_startcode_internal(p, end); - if(p<out && out<end && !out[-1]) out--; - return out; + return av_nal_find_startcode(p, end); } static int nal_parse_units(AVIOContext *pb, NALUList *list, diff --git a/libavutil/Makefile b/libavutil/Makefile index c5241895ff..9d7c9d8820 100644 --- a/libavutil/Makefile +++ b/libavutil/Makefile @@ -164,6 +164,7 @@ OBJS = adler32.o \ md5.o \ mem.o \ murmur3.o \ + nal.o \ opt.o \ parseutils.o \ pixdesc.o \ diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile index b70702902f..27d4b8bc65 100644 --- a/libavutil/aarch64/Makefile +++ b/libavutil/aarch64/Makefile @@ -6,6 +6,7 @@ ARMV8-OBJS += aarch64/crc.o NEON-OBJS += aarch64/float_dsp_neon.o \ aarch64/tx_float_neon.o \ + aarch64/nal.o \ SVE-OBJS += aarch64/cpu_sve.o \ diff --git a/libavutil/aarch64/nal.S b/libavutil/aarch64/nal.S new file mode 100644 index 0000000000..f6b3e4afcd --- /dev/null +++ b/libavutil/aarch64/nal.S @@ -0,0 +1,169 @@ +/* + * ARM NEON-optimized NAL startcode search + * Copyright (c) 2024 + * + * This 
file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +function ff_nal_find_startcode_neon, export=1 + and x2, x0, #-4 // align to 4-byte boundary + sub x7, x1, #3 // end -= 3 + add x2, x2, #4 // align4 = aligned_p + 4 + mov x3, x0 // p = orig_p + cmp x0, x2 + ccmp x7, x0, #0, cc + bls 2f // skip alignment phase + + // Phase 1: align to 4-byte boundary +1: ldrb w0, [x3] + cbnz w0, 3f + ldrb w0, [x3, #1] + cbnz w0, 3f + ldrb w0, [x3, #2] + cmp w0, #1 + beq 22f // found 00 00 01 +3: add x3, x3, #1 + cmp x2, x3 + ccmp x7, x3, #0, hi + bhi 1b + +2: sub x0, x7, x3 // remaining = end - p + cmp x0, #63 + bgt 43f // enter NEON phase if >= 64 bytes + + // Phase 3: byte-by-byte check for remaining data +4: cmp x7, x3 + bls 8f +5: ldrb w0, [x3] + cbnz w0, 6f + ldrb w0, [x3, #1] + cbnz w0, 6f + ldrb w0, [x3, #2] + cmp w0, #1 + beq 22f +6: add x3, x3, #1 + cmp x7, x3 + bne 5b +8: mov x0, x1 // return orig_end + ret + + // Phase 2: NEON acceleration (64-byte blocks) +43: sub x8, x1, #66 // end64 = end - 66 + cmp x8, x3 + bls 4b + mov w6, #65279 // 0xFEFF + add x5, x3, #64 // chunk_end = p + 64 + movk w6, #0xfefe, lsl #16 // 0xFEFEFEFF + b 10f + +9: add x3, x3, #64 // p += 64 + add x5, x5, #64 // chunk_end += 64 + cmp x8, x3 + bls 4b + +10: 
// Load 64 bytes (4x16-byte vectors) + ldp q31, q30, [x3] // load first 32 bytes + ldp q29, q28, [x3, #32] // load next 32 bytes + prfm PLDL1KEEP, [x3, #192] // prefetch + + // Check for zero bytes (data == 0) + cmeq v31.16b, v31.16b, #0 // z0 + cmeq v30.16b, v30.16b, #0 // z1 + cmeq v29.16b, v29.16b, #0 // z2 + cmeq v28.16b, v28.16b, #0 // z3 + + movi v23.16b, #255 // 00 pair = byte and next byte both 0; unseen byte 64 counts as 0 + ext v24.16b, v31.16b, v30.16b, #1 // zs0 = z0[1..15], z1[0] + ext v27.16b, v30.16b, v29.16b, #1 // zs1 = z1[1..15], z2[0] + ext v26.16b, v29.16b, v28.16b, #1 // zs2 = z2[1..15], z3[0] + ext v25.16b, v28.16b, v23.16b, #1 // zs3 = z3[1..15], 0xff + + // pattern00 = zero & zero_shift + and v24.16b, v24.16b, v31.16b // p0 + and v27.16b, v27.16b, v30.16b // p1 + and v26.16b, v26.16b, v29.16b // p2 + and v25.16b, v25.16b, v28.16b // p3 + + // Check if any 00 pattern exists (fast ORR test) + orr v27.16b, v24.16b, v27.16b + orr v25.16b, v26.16b, v25.16b + orr v25.16b, v25.16b, v27.16b + dup d31, v25.d[1] + orr v31.8b, v31.8b, v25.8b + fmov x0, d31 + cbz x0, 9b // no 00 pattern, skip to next chunk + + // Detailed check of this 64-byte chunk + mov x0, x3 +11: ldr w2, [x0] + add w4, w2, w6 // x - 0x01010101 + bic w2, w4, w2 // (~x) & (x - 0x01010101) + tst w2, #-2139062144 // & 0x80808080 + beq 12f + + ldrb w2, [x0, #1] + cbnz w2, 13f + ldrb w4, [x0] + ldrb w2, [x0, #2] + cbnz w4, 14f + cmp w2, #1 + beq 18f // found 00 00 01 +14: ldrb w4, [x0, #3] + cbnz w2, 15f + cmp w4, #1 + beq 44f // found 00 00 01 (offset +1) + cbnz w4, 12f +16: ldrb w2, [x0, #4] + cmp w2, #1 + beq 45f // found 00 00 01 (offset +2) +17: cbnz w2, 12f + ldrb w2, [x0, #5] + cmp w2, #1 + beq 46f // found 00 00 01 (offset +3) + +12: add x0, x0, #4 + cmp x0, x5 + bne 11b + b 9b + +13: ldrb w2, [x0, #3] + cbnz w2, 12b + ldrb w2, [x0, #2] + cbz w2, 16b + ldrb w2, [x0, #4] + b 17b + +15: cbnz w4, 12b + ldrb w2, [x0, #4] + b 17b + +22: mov x0, x3 + ret + +45: add x0, x0, #2 + ret + +44: add x0, x0, #1 + ret + +46: add x0, x0, #3 + ret + +18: ret +endfunc diff --git 
a/libavutil/nal.c b/libavutil/nal.c new file mode 100644 index 0000000000..ec2d8dc46f --- /dev/null +++ b/libavutil/nal.c @@ -0,0 +1,100 @@ +/* + * NAL utility functions + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include <string.h> + +#include "config.h" +#include "attributes.h" +#include "cpu.h" +#include "thread.h" +#if ARCH_AARCH64 +#include "aarch64/cpu.h" +#endif +#include "nal.h" + +const uint8_t *ff_nal_find_startcode_c(const uint8_t *p, const uint8_t *end) +{ + const uint8_t *a = p + 4 - ((intptr_t)p & 3); + + for (end -= 3; p < a && p < end; p++) { + if (p[0] == 0 && p[1] == 0 && p[2] == 1) + return p; + } + + for (end -= 3; p < end; p += 4) { + uint32_t x = *(const uint32_t*)p; +// if ((x - 0x01000100) & (~x) & 0x80008000) // little endian +// if ((x - 0x00010001) & (~x) & 0x00800080) // big endian + if ((x - 0x01010101) & (~x) & 0x80808080) { // generic + if (p[1] == 0) { + if (p[0] == 0 && p[2] == 1) + return p; + if (p[2] == 0 && p[3] == 1) + return p+1; + } + if (p[3] == 0) { + if (p[2] == 0 && p[4] == 1) + return p+2; + if (p[4] == 0 && p[5] == 1) + return p+3; + } + } + } + + for (end += 3; p < end; p++) { + if (p[0] == 0 && p[1] == 0 && p[2] == 1) + return p; + } + + return end + 3; +} + +// Function 
pointer to the active implementation +static const uint8_t *(*nal_find_startcode_func)(const uint8_t *p, const uint8_t *end); + +// Thread-safe initialization using ff_thread_once +static AVOnce nal_func_init_once = AV_ONCE_INIT; + +static void nal_find_startcode_init(void) +{ +#if ARCH_AARCH64 + int cpu_flags = av_get_cpu_flags(); + if (have_neon(cpu_flags)) + nal_find_startcode_func = ff_nal_find_startcode_neon; + else + nal_find_startcode_func = ff_nal_find_startcode_c; +#else + nal_find_startcode_func = ff_nal_find_startcode_c; +#endif +} + +const uint8_t *av_nal_find_startcode(const uint8_t *p, const uint8_t *end) +{ + const uint8_t *out; + // Initialize function pointer on first call (thread-safe) + ff_thread_once(&nal_func_init_once, nal_find_startcode_init); + out = nal_find_startcode_func(p, end); + + // Include a preceding zero byte (00 00 00 01), but never step before p + if (p < out && out < end && !out[-1]) + out--; + return out; +} diff --git a/libavutil/nal.h b/libavutil/nal.h new file mode 100644 index 0000000000..ccc3f7b70d --- /dev/null +++ b/libavutil/nal.h @@ -0,0 +1,60 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_NAL_H +#define AVUTIL_NAL_H + +#include <stdint.h> + +/** + * @file + * @ingroup lavu_misc + * NAL (Network Abstraction Layer) utility functions + */ + +/** + * @addtogroup lavu_misc + * @{ + */ + +/** + * Find an H.264/H.265 NAL startcode (00 00 01 or 00 00 00 01) in a buffer. + * + * @param p Pointer to start searching from + * @param end Pointer to end of buffer (must have at least 3 bytes padding) + * @return Pointer to startcode, or end if not found + * + * @note This function searches for the pattern 00 00 01 (three-byte startcode) + * or 00 00 00 01 (four-byte startcode). When found, it returns a pointer + * to the startcode. If the byte before the startcode is also 0, it returns + * that position instead (to handle the four-byte case). + * If no startcode is found, returns end. 
+ */ +const uint8_t *av_nal_find_startcode(const uint8_t *p, const uint8_t *end); + +/* Internal implementations exposed for testing */ +const uint8_t *ff_nal_find_startcode_c(const uint8_t *p, const uint8_t *end); +#if ARCH_AARCH64 +const uint8_t *ff_nal_find_startcode_neon(const uint8_t *p, const uint8_t *end); +#endif + +/** + * @} + */ + +#endif /* AVUTIL_NAL_H */ diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 48f358d40d..54f034caae 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -95,6 +95,7 @@ AVUTILOBJS += crc.o AVUTILOBJS += fixed_dsp.o AVUTILOBJS += float_dsp.o AVUTILOBJS += lls.o +AVUTILOBJS += nal.o CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 7dcdaeb2a4..835ae89905 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -348,6 +348,7 @@ static const struct { { "fixed_dsp", checkasm_check_fixed_dsp }, { "float_dsp", checkasm_check_float_dsp }, { "lls", checkasm_check_lls }, + { "nal", checkasm_check_nal }, { "av_tx", checkasm_check_av_tx }, #endif { NULL } diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index e3addec21e..711036873b 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -120,6 +120,7 @@ void checkasm_check_idet(void); void checkasm_check_jpeg2000dsp(void); void checkasm_check_llauddsp(void); void checkasm_check_lls(void); +void checkasm_check_nal(void); void checkasm_check_llviddsp(void); void checkasm_check_llvidencdsp(void); void checkasm_check_lpc(void); diff --git a/tests/checkasm/nal.c b/tests/checkasm/nal.c new file mode 100644 index 0000000000..97ef2909cb --- /dev/null +++ b/tests/checkasm/nal.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2024 + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <string.h> + +#include "libavutil/common.h" +#include "libavcodec/defs.h" +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" +#include "libavutil/cpu.h" +#if ARCH_AARCH64 +#include "libavutil/aarch64/cpu.h" +#endif + +#include "checkasm.h" + +#include "libavutil/nal.h" + +#define BUF_SIZE (8192 + AV_INPUT_BUFFER_PADDING_SIZE) + +void checkasm_check_nal(void) +{ + LOCAL_ALIGNED_8(uint8_t, buf, [BUF_SIZE]); + const uint8_t *ref_res, *new_res; + +#if ARCH_AARCH64 + int cpu_flags = av_get_cpu_flags(); + int have_neon_impl = have_neon(cpu_flags); + + if (have_neon_impl) { + declare_func(const uint8_t *, const uint8_t *p, const uint8_t *end); + + // Set C version as reference implementation + func_ref = ff_nal_find_startcode_c; + + // Test 1: Startcode at beginning + memset(buf, 0xFF, BUF_SIZE); + AV_WN32A(buf, 0x01000000); + if (check_func(ff_nal_find_startcode_neon, "startcode_at_beginning")) { + ref_res = call_ref(buf, buf + 8192); + new_res = call_new(buf, buf + 8192); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 8192); + } + + // Test 2: Startcode at offset 4 (three-byte) + memset(buf, 0xFF, BUF_SIZE); + AV_WN32A(buf + 4, 0x010000); + if (check_func(ff_nal_find_startcode_neon, "startcode_at_offset_4")) { + 
ref_res = call_ref(buf, buf + 8192); + new_res = call_new(buf, buf + 8192); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 8192); + } + + // Test 3: Multiple startcodes, find first one + memset(buf, 0, BUF_SIZE); + AV_WN32A(buf + 100, 0x01000000); + AV_WN32A(buf + 500, 0x01000000); + AV_WN32A(buf + 1000, 0x01000000); + if (check_func(ff_nal_find_startcode_neon, "multiple_startcodes")) { + ref_res = call_ref(buf, buf + 8192); + new_res = call_new(buf, buf + 8192); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 8192); + } + + // Test 4: No startcode (all 0xFF) - CRITICAL TEST + memset(buf, 0xFF, 256); + memset(buf + 256, 0, AV_INPUT_BUFFER_PADDING_SIZE); + if (check_func(ff_nal_find_startcode_neon, "no_startcode_0xFF")) { + ref_res = call_ref(buf, buf + 256); + new_res = call_new(buf, buf + 256); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 256); + } + + // Test 5: No startcode (all zeros) + memset(buf, 0, 256); + memset(buf + 256, 0, AV_INPUT_BUFFER_PADDING_SIZE); + if (check_func(ff_nal_find_startcode_neon, "no_startcode_zeros")) { + ref_res = call_ref(buf, buf + 256); + new_res = call_new(buf, buf + 256); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 256); + } + + // Test 6: Startcode near end + memset(buf, 0xFF, BUF_SIZE); + AV_WN32A(buf + 8188, 0x01000000); + if (check_func(ff_nal_find_startcode_neon, "startcode_near_end")) { + ref_res = call_ref(buf, buf + 8192); + new_res = call_new(buf, buf + 8192); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 8192); + } + + // Test 7: Search from middle + memset(buf, 0, BUF_SIZE); + AV_WN32A(buf + 100, 0x01000000); + AV_WN32A(buf + 500, 0x01000000); + if (check_func(ff_nal_find_startcode_neon, "search_from_middle")) { + ref_res = call_ref(buf + 200, buf + 8192); + new_res = call_new(buf + 200, buf + 8192); + if (ref_res != new_res) + fail(); + bench_new(buf + 200, buf + 8192); + } + + // Test 8: Small buffer (16 bytes) + memset(buf, 0xFF, 16); + memset(buf + 
16, 0, AV_INPUT_BUFFER_PADDING_SIZE); + if (check_func(ff_nal_find_startcode_neon, "small_buffer_16")) { + ref_res = call_ref(buf, buf + 16); + new_res = call_new(buf, buf + 16); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 16); + } + + // Test 9: Very small buffer (4 bytes) + memset(buf, 0xFF, 4); + memset(buf + 4, 0, AV_INPUT_BUFFER_PADDING_SIZE); + if (check_func(ff_nal_find_startcode_neon, "tiny_buffer_4")) { + ref_res = call_ref(buf, buf + 4); + new_res = call_new(buf, buf + 4); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 4); + } + + // Test 10: Three-byte startcode pattern + memset(buf, 0xFF, BUF_SIZE); + buf[50] = 0x00; + buf[51] = 0x00; + buf[52] = 0x01; + if (check_func(ff_nal_find_startcode_neon, "three_byte_startcode")) { + ref_res = call_ref(buf, buf + 8192); + new_res = call_new(buf, buf + 8192); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 8192); + } + + // Test 11: Random data with startcode + for (int i = 0; i < 8192; i++) { + buf[i] = rnd() & 0xFF; + } + memset(buf + 8192, 0, AV_INPUT_BUFFER_PADDING_SIZE); + int pos = 100 + 4 * (rnd() % 2000); // 4-byte aligned, startcode fully inside the 8192 searched bytes + AV_WN32A(buf + pos, 0x01000000); + if (check_func(ff_nal_find_startcode_neon, "random_with_startcode")) { + ref_res = call_ref(buf, buf + 8192); + new_res = call_new(buf, buf + 8192); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 8192); + } + + // Test 12: Large buffer with no startcode + memset(buf, 0xAA, 8192); + memset(buf + 8192, 0, AV_INPUT_BUFFER_PADDING_SIZE); + if (check_func(ff_nal_find_startcode_neon, "large_no_startcode")) { + ref_res = call_ref(buf, buf + 8192); + new_res = call_new(buf, buf + 8192); + if (ref_res != new_res) + fail(); + bench_new(buf, buf + 8192); + } + } +#endif + + report("nal"); +} diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index b08c1947cd..aac7e62ffd 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -36,6 +36,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \ fate-checkasm-jpeg2000dsp 
\ fate-checkasm-llauddsp \ fate-checkasm-lls \ + fate-checkasm-nal \ fate-checkasm-llviddsp \ fate-checkasm-llvidencdsp \ fate-checkasm-lpc \ -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
