On date Saturday 2015-06-13 14:20:07 +0200, Hendrik Leppkes encoded: > On Thu, Jun 11, 2015 at 8:54 PM, wm4 <nfx...@googlemail.com> wrote: > > On Thu, 11 Jun 2015 17:24:45 +0200 > > Stefano Sabatini <stefa...@gmail.com> wrote: > > > >> Next step would be the use of YASM, but I only want to test if the > >> general approach is fine (and if the API is not too specific). Also if > >> someone wants to step up and port it to YASM I'm all for it, since > >> ASM/YASM is far from being my area of expertise. > > > > Personally, I'd probably just > > 1. export the GPU memcpy function, and > > 2. export a function to copy AVFrames using this function > > I concur. A basic optimized memcpy with specific constraints (ie. > requires aligned input/output, always copies in 16-byte chunks, so > in/out buffers need to be padded appropriately), to keep the required > ASM code simple. > These constraints are generally always fulfilled if you have a GPU > frame on the input, since they will have appropriate strides (and if > in question, we control allocation of the GPU surfaces as well), and > we control the output memory buffer anyway. > > On top of that a convenience function that deals with pixel formats, > strides, planes, and whatnot, and then uses this function. > A generic C version of the basic copy function shouldn't be needed, we > could just use memcpy for that.. or a tiny wrapper that calls memcpy, > anyway.
This is my first attempt. The added function is named av_memcpynt(); it uses inline assembly, which should be replaced by YASM once I or someone else figures out how to do it. An av_image_copynt_plane() function can be built on top of that (but in this case it would be better to inline the av_memcpynt() function). BTW, I dropped the requirement of 16-byte alignment on the size variable, which is required by the VLC code but looks unnecessary to me. -- FFmpeg = Furious and Foolish Marvellous Pacific Egregious Ghost
>From 3a75ef1e86360cd6f30b8e550307404d0d1c1dba Mon Sep 17 00:00:00 2001 From: Stefano Sabatini <stefa...@gmail.com> Date: Mon, 15 Jun 2015 11:02:50 +0200 Subject: [PATCH] lavu/mem: add av_memcpynt() function with x86 optimizations Assembly based on code from vlc dxva2.c, commit 62107e56 by Laurent Aimar <fen...@videolan.org>. TODO: bump minor, update APIchanges --- libavutil/mem.c | 9 +++++ libavutil/mem.h | 14 ++++++++ libavutil/mem_internal.h | 26 +++++++++++++++ libavutil/x86/Makefile | 1 + libavutil/x86/mem.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 135 insertions(+) create mode 100644 libavutil/mem_internal.h create mode 100644 libavutil/x86/mem.c diff --git a/libavutil/mem.c b/libavutil/mem.c index da291fb..0e1eb01 100644 --- a/libavutil/mem.c +++ b/libavutil/mem.c @@ -42,6 +42,7 @@ #include "dynarray.h" #include "intreadwrite.h" #include "mem.h" +#include "mem_internal.h" #ifdef MALLOC_PREFIX @@ -515,3 +516,11 @@ void av_fast_malloc(void *ptr, unsigned int *size, size_t min_size) ff_fast_malloc(ptr, size, min_size, 0); } +void av_memcpynt(void *dst, const void *src, size_t size, int cpu_flags) +{ +#if ARCH_X86 + ff_memcpynt_x86(dst, src, size, cpu_flags); +#else + memcpy(dst, src, size, cpu_flags); +#endif +} diff --git a/libavutil/mem.h b/libavutil/mem.h index 2a1e36d..bbad313 100644 --- a/libavutil/mem.h +++ b/libavutil/mem.h @@ -383,6 +383,20 @@ void *av_fast_realloc(void *ptr, unsigned int *size, size_t min_size); void av_fast_malloc(void *ptr, unsigned int *size, size_t min_size); /** + * Copy size bytes from src to dst, using non-temporal copy + * functions when available. + * + * This function works as memcpy, but adopts non-temporal instructions + * when available. This can lead to better performance when + * transferring data from source to destination is expensive, for + * example when reading from GPU memory. 
+ * + * @param dst destination memory pointer, must be aligned to 16 bytes + * @param cpu_flags as returned by av_get_cpu_flags() + */ +void av_memcpynt(void *dst, const void *src, size_t size, int cpu_flags); + +/** * @} */ diff --git a/libavutil/mem_internal.h b/libavutil/mem_internal.h new file mode 100644 index 0000000..371be31 --- /dev/null +++ b/libavutil/mem_internal.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_MEM_INTERNAL_H +#define AVUTIL_MEM_INTERNAL_H + +#include "mem.h" + +void ff_memcpynt_x86(void *dst, const void *src, size_t size, int cpu_flags); + +#endif /* AVUTIL_MEM_INTERNAL_H */ diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile index a719c00..171c351 100644 --- a/libavutil/x86/Makefile +++ b/libavutil/x86/Makefile @@ -2,6 +2,7 @@ OBJS += x86/cpu.o \ x86/float_dsp_init.o \ x86/imgutils.o \ x86/lls_init.o \ + x86/mem.o \ OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \ diff --git a/libavutil/x86/mem.c b/libavutil/x86/mem.c new file mode 100644 index 0000000..fef3b7f --- /dev/null +++ b/libavutil/x86/mem.c @@ -0,0 +1,85 @@ +/* + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <inttypes.h> +#include "config.h" +#include "libavutil/avassert.h" +#include "libavutil/mem_internal.h" + +#if HAVE_SSE2 +/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction + * load and storing data with the SSE>=2 instruction store. 
+ */ +#define COPY16(dstp, srcp, load, store) \ + __asm__ volatile ( \ + load " 0(%[src]), %%xmm1\n" \ + store " %%xmm1, 0(%[dst])\n" \ + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1") + +#define COPY64(dstp, srcp, load, store) \ + __asm__ volatile ( \ + load " 0(%[src]), %%xmm1\n" \ + load " 16(%[src]), %%xmm2\n" \ + load " 32(%[src]), %%xmm3\n" \ + load " 48(%[src]), %%xmm4\n" \ + store " %%xmm1, 0(%[dst])\n" \ + store " %%xmm2, 16(%[dst])\n" \ + store " %%xmm3, 32(%[dst])\n" \ + store " %%xmm4, 48(%[dst])\n" \ + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4") +#endif + +#define COPY_LINE(dstp, srcp, size, load) \ + const unsigned unaligned = (-(uintptr_t)srcp) & 0x0f; \ + unsigned x = unaligned; \ + \ + av_assert0(((intptr_t)dstp & 0x0f) == 0); \ + \ + __asm__ volatile ("mfence"); \ + if (!unaligned) { \ + for (; x+63 < size; x += 64) \ + COPY64(&dstp[x], &srcp[x], load, "movdqa"); \ + } else { \ + COPY16(dst, src, "movdqu", "movdqa"); \ + for (; x+63 < size; x += 64) \ + COPY64(&dstp[x], &srcp[x], load, "movdqu"); \ + } \ + \ + for (; x < size; x++) \ + dstp[x] = srcp[x]; \ + __asm__ volatile ("mfence"); + +void ff_memcpynt_x86(void *dst, const void *src, size_t size, int cpu_flags) +{ + uint8_t *dstu = dst; + const uint8_t *srcu = src; + +#if HAVE_SSE4 + if (cpu_flags & AV_CPU_FLAG_SSE4) { + COPY_LINE(dstu, srcu, size, "movntdqa"); + return; + } +#endif +#if HAVE_SSE2 + if (cpu_flags & AV_CPU_FLAG_SSE2) { + COPY_LINE(dstu, srcu, size, "movdqa"); + return; + } +#endif + memcpy(dst, src, size); +} -- 1.9.1
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel