On date Tuesday 2015-06-16 10:20:31 +0200, wm4 encoded:
> On Mon, 15 Jun 2015 17:55:35 +0200
> Stefano Sabatini <stefa...@gmail.com> wrote:
> 
> > On date Monday 2015-06-15 11:56:13 +0200, Stefano Sabatini encoded:
> > [...]
> > > From 3a75ef1e86360cd6f30b8e550307404d0d1c1dba Mon Sep 17 00:00:00 2001
> > > From: Stefano Sabatini <stefa...@gmail.com>
> > > Date: Mon, 15 Jun 2015 11:02:50 +0200
> > > Subject: [PATCH] lavu/mem: add av_memcpynt() function with x86 
> > > optimizations
> > > 
> > > Assembly based on code from vlc dxva2.c, commit 62107e56 by Laurent Aimar
> > > <fen...@videolan.org>.
> > > 
> > > TODO: bump minor, update APIchanges
> > > ---
> > >  libavutil/mem.c          |  9 +++++
> > >  libavutil/mem.h          | 14 ++++++++
> > >  libavutil/mem_internal.h | 26 +++++++++++++++
> > >  libavutil/x86/Makefile   |  1 +
> > >  libavutil/x86/mem.c      | 85 
> > > ++++++++++++++++++++++++++++++++++++++++++++++++
> > >  5 files changed, 135 insertions(+)
> > >  create mode 100644 libavutil/mem_internal.h
> > >  create mode 100644 libavutil/x86/mem.c
> > > 
> > > diff --git a/libavutil/mem.c b/libavutil/mem.c
> > > index da291fb..0e1eb01 100644
> > > --- a/libavutil/mem.c
> > > +++ b/libavutil/mem.c
> > > @@ -42,6 +42,7 @@
> > >  #include "dynarray.h"
> > >  #include "intreadwrite.h"
> > >  #include "mem.h"
> > > +#include "mem_internal.h"
> > >  
> > >  #ifdef MALLOC_PREFIX
> > >  
> > > @@ -515,3 +516,11 @@ void av_fast_malloc(void *ptr, unsigned int *size, 
> > > size_t min_size)
> > >      ff_fast_malloc(ptr, size, min_size, 0);
> > >  }
> > >  
> > > +void av_memcpynt(void *dst, const void *src, size_t size, int cpu_flags)
> > > +{
> > > +#if ARCH_X86
> > > +    ff_memcpynt_x86(dst, src, size, cpu_flags);
> > > +#else
> > > +    memcpy(dst, src, size, cpu_flags);
> > > +#endif
> > > +}
> > 
> > Alternatively, what about something like:
> > 
> > av_memcpynt_fn av_memcpynt_get_fn(void);
> > 
> > modeled after av_pixelutils_get_sad_fn()? This would skip the need for
> > a wrapper calling the right function.
> 

> I don't see much value in this, unless determining the right function
> causes too much overhead.

I see two advantages, 1. no branch and function call when the function
is called, 2. the cpu_flags must not be passed around, so it's somehow
safer.

I have no strong preference though, updated (untested patch) in
attachment.
-- 
FFmpeg = Fierce and Forgiving Merciless Powered Extroverse Gargoyle
>From c005ff5405dd48e6b0fed24ed94947f69bfe2783 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefa...@gmail.com>
Date: Mon, 15 Jun 2015 11:02:50 +0200
Subject: [PATCH] lavu/mem: add av_memcpynt_get_fn()

Assembly based on code from vlc dxva2.c, commit 62107e56 by Laurent Aimar
<fen...@videolan.org>.

TODO: remove use of inline assembly, bump minor, update APIchanges
---
 libavutil/mem.c          |  9 +++++
 libavutil/mem.h          | 13 +++++++
 libavutil/mem_internal.h | 26 +++++++++++++
 libavutil/x86/Makefile   |  1 +
 libavutil/x86/mem.c      | 98 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 147 insertions(+)
 create mode 100644 libavutil/mem_internal.h
 create mode 100644 libavutil/x86/mem.c

diff --git a/libavutil/mem.c b/libavutil/mem.c
index da291fb..325bfc9 100644
--- a/libavutil/mem.c
+++ b/libavutil/mem.c
@@ -42,6 +42,7 @@
 #include "dynarray.h"
 #include "intreadwrite.h"
 #include "mem.h"
+#include "mem_internal.h"
 
 #ifdef MALLOC_PREFIX
 
@@ -515,3 +516,11 @@ void av_fast_malloc(void *ptr, unsigned int *size, size_t min_size)
     ff_fast_malloc(ptr, size, min_size, 0);
 }
 
+av_memcpynt_fn av_memcpynt_get_fn(void)
+{
+#if ARCH_X86
+    return ff_memcpynt_get_fn_x86();
+#else
+    return memcpy;
+#endif
+}
diff --git a/libavutil/mem.h b/libavutil/mem.h
index 2a1e36d..d9f1b7a 100644
--- a/libavutil/mem.h
+++ b/libavutil/mem.h
@@ -382,6 +382,19 @@ void *av_fast_realloc(void *ptr, unsigned int *size, size_t min_size);
  */
 void av_fast_malloc(void *ptr, unsigned int *size, size_t min_size);
 
+typedef void* (*av_memcpynt_fn)(void *dst, const void *src, size_t size);
+
+/**
+ * Return possibly optimized function to copy size bytes from from src
+ * to dst, using non-temporal copy.
+ *
+ * The returned function works as memcpy, but adopts non-temporal
+ * instructios when available. This can lead to better performances
+ * when transferring data from source to destination is expensive, for
+ * example when reading from GPU memory.
+ */
+av_memcpynt_fn av_memcpynt_get_fn(void);
+
 /**
  * @}
  */
diff --git a/libavutil/mem_internal.h b/libavutil/mem_internal.h
new file mode 100644
index 0000000..de61cba
--- /dev/null
+++ b/libavutil/mem_internal.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_MEM_INTERNAL_H
+#define AVUTIL_MEM_INTERNAL_H
+
+#include "mem.h"
+
+av_memcpynt_fn ff_memcpynt_get_fn_x86(void);
+
+#endif /* AVUTIL_MEM_INTERNAL_H */
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index a719c00..171c351 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -2,6 +2,7 @@ OBJS += x86/cpu.o                                                       \
         x86/float_dsp_init.o                                            \
         x86/imgutils.o                                                  \
         x86/lls_init.o                                                  \
+        x86/mem.o                                                       \
 
 OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o                      \
 
diff --git a/libavutil/x86/mem.c b/libavutil/x86/mem.c
new file mode 100644
index 0000000..6326c90
--- /dev/null
+++ b/libavutil/x86/mem.c
@@ -0,0 +1,98 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include "config.h"
+#include "libavutil/avassert.h"
+#include "libavutil/mem_internal.h"
+
+#if HAVE_SSE2
+/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction
+ * load and storing data with the SSE>=2 instruction store.
+ */
+#define COPY16(dstp, srcp, load, store) \
+    __asm__ volatile (                  \
+        load "  0(%[src]), %%xmm1\n"    \
+        store " %%xmm1,    0(%[dst])\n" \
+        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
+
+#define COPY64(dstp, srcp, load, store) \
+    __asm__ volatile (                  \
+        load "  0(%[src]), %%xmm1\n"    \
+        load " 16(%[src]), %%xmm2\n"    \
+        load " 32(%[src]), %%xmm3\n"    \
+        load " 48(%[src]), %%xmm4\n"    \
+        store " %%xmm1,    0(%[dst])\n" \
+        store " %%xmm2,   16(%[dst])\n" \
+        store " %%xmm3,   32(%[dst])\n" \
+        store " %%xmm4,   48(%[dst])\n" \
+        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
+#endif
+
+#define COPY_LINE(dstp, srcp, size, load)                               \
+    const unsigned unaligned = (-(uintptr_t)srcp) & 0x0f;               \
+    unsigned x = unaligned;                                             \
+                                                                        \
+    av_assert0(((intptr_t)dstp & 0x0f) == 0);                           \
+                                                                        \
+    __asm__ volatile ("mfence");                                        \
+    if (!unaligned) {                                                   \
+        for (; x+63 < size; x += 64)                                    \
+            COPY64(&dstp[x], &srcp[x], load, "movdqa");                 \
+    } else {                                                            \
+        COPY16(dst, src, "movdqu", "movdqa");                           \
+        for (; x+63 < size; x += 64)                                    \
+            COPY64(&dstp[x], &srcp[x], load, "movdqu");                 \
+    }                                                                   \
+                                                                        \
+    for (; x < size; x++)                                               \
+        dstp[x] = srcp[x];                                              \
+    __asm__ volatile ("mfence");
+
+static void* ff_memcpynt_sse4(void *dst, const void *src, size_t size)
+{
+    uint8_t *dstu = dst;
+    const uint8_t *srcu = src;
+
+    COPY_LINE(dstu, srcu, size, "movntdqa");
+    return dst;
+}
+
+static void* ff_memcpynt_sse2(void *dst, const void *src, size_t size)
+{
+    uint8_t *dstu = dst;
+    const uint8_t *srcu = src;
+
+    COPY_LINE(dstu, srcu, size, "movntdqa");
+    return dst;
+}
+
+av_memcpynt_fn ff_memcpynt_get_fn_x86()
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_SSE4
+    if (cpu_flags & AV_CPU_FLAG_SSE4)
+        return ff_memcpynt_sse4;
+#endif
+#if HAVE_SSE2
+    if (cpu_flags & AV_CPU_FLAG_SSE2)
+        return ff_memcpynt_sse2;
+#endif
+    return memcpy;
+}
-- 
1.9.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to