[mpeg2-dev] [RFC] [PATCH] ARM Advanced SIMD motion compensation

Rémi Denis-Courmont Tue, 15 Sep 2009 13:32:55 -0700

        Hello all,

ARMv7 includes an optional "Advanced SIMD" instruction set, commercially
known as NEON. It is available in the recent Cortex line of ARM processors.
In particular, the Cortex-A8 is found on TI OMAP3xxx boards such as the
BeagleBoard, and in the Nokia N900.


Attached is an initial patch against libmpeg2 trunk to use NEON for motion
compensation. This is preliminary. There are a number of known CPU stalls.
Those could probably be fixed by using plain assembly and interleaving
subsequent loads. Also, the iDCT is not optimized. Anyway, here are my results
with an OMAP3430 board:

With C, no acceleration:
7305 frames in 19.87 sec (367.64 fps), 155 last 0.50 sec (310.00 fps)
7308 frames decoded in 19.88 seconds (367.61 fps)                    
7288 frames in 19.88 sec (366.60 fps), 170 last 0.50 sec (340.00 fps)
7308 frames decoded in 19.95 seconds (366.32 fps)                    

With ARM acceleration (current libmpeg2):
7254 frames in 18.88 sec (384.22 fps), 180 last 0.50 sec (360.00 fps)
7308 frames decoded in 19.04 seconds (383.82 fps)
7263 frames in 18.88 sec (384.69 fps), 175 last 0.50 sec (350.00 fps)
7308 frames decoded in 19.02 seconds (384.23 fps)

With NEON acceleration (this patch):
7129 frames in 15.39 sec (463.22 fps), 245 last 0.50 sec (490.00 fps)
7308 frames decoded in 15.85 seconds (461.07 fps)
7127 frames in 15.38 sec (463.39 fps), 245 last 0.50 sec (490.00 fps)
7308 frames decoded in 15.85 seconds (461.07 fps)

So, there is already quite a big improvement!

I wonder if there is any guarantee on the memory alignment of some of the
buffers? NEON can save one cycle per load/store if we use alignment-specific
opcodes. Currently, the code assumes no alignment.

Comments welcome!

-- 
Rémi Denis-Courmont
http://git.remlab.net/cgi-bin/gitweb.cgi?p=vlc-courmisch.git;a=summary

Index: include/mpeg2.h
===================================================================
--- include/mpeg2.h	(révision 1204)
+++ include/mpeg2.h	(copie de travail)
@@ -163,6 +163,7 @@
 #define MPEG2_ACCEL_SPARC_VIS 1
 #define MPEG2_ACCEL_SPARC_VIS2 2
 #define MPEG2_ACCEL_ARM 1
+#define MPEG2_ACCEL_ARM_NEON 2
 #define MPEG2_ACCEL_DETECT 0x80000000
 
 uint32_t mpeg2_accel (uint32_t accel);
Index: libmpeg2/motion_comp_neon.c
===================================================================
--- libmpeg2/motion_comp_neon.c	(révision 0)
+++ libmpeg2/motion_comp_neon.c	(révision 0)
@@ -0,0 +1,345 @@
+/*
+ * motion_comp_neon.c
+ * Copyright (C) 2009 Rémi Denis-Courmont
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpeg2dec; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "config.h"
+
+#if defined(ARCH_ARM_NEON)
+
+#include <stdint.h>
+#include <string.h>
+
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
+
+/*
+ * Conventions shared by the routines below:
+ *  - each loop iteration handles one row of 8 or 16 pixels, then advances
+ *    the pointers by stride bytes;
+ *  - rows are handled in a do/while loop, so height must be at least 1;
+ *  - vld1/vst1 are used without an alignment qualifier, so no particular
+ *    alignment of ref or dest is assumed;
+ *  - vrhadd.u8 is the per-byte rounding halving add, (a + b + 1) >> 1;
+ *  - in clobber lists, qN stands for the register pair d(2N), d(2N+1).
+ */
+
+/* dest = ref (16 pixels per row) */
+static void MC_put_o_16_neon (uint8_t * dest, const uint8_t * ref,
+			      const int stride, int height)
+{
+    do {
+	memcpy (dest, ref, 16);
+	ref += stride;
+	dest += stride;
+    } while (--height);
+}
+
+/* dest = ref (8 pixels per row) */
+static void MC_put_o_8_neon (uint8_t * dest, const uint8_t * ref,
+			     const int stride, int height)
+{
+    do {
+	memcpy (dest, ref, 8);
+	ref += stride;
+	dest += stride;
+    } while (--height);
+}
+
+/* dest = (src1 + src2 + 1) / 2 (16 pixels per row) */
+static void MC_avg_1_16_neon (uint8_t * dest, const uint8_t * src1,
+			      const uint8_t * src2,
+			      const int stride, unsigned height)
+{
+    do {
+	asm volatile (
+	    "vld1.u8 {q0}, [%[src1]]\n"
+	    "vld1.u8 {q1}, [%[src2]]\n"
+	    "vrhadd.u8 q0, q0, q1\n"
+	    /* XXX: three cycles stall */
+	    "vst1.u8 {q0}, [%[dest]]\n"
+	    :
+	    : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
+	    : "memory", "q0", "q1");
+	src1 += stride;
+	src2 += stride;
+	dest += stride;
+    } while (--height);
+}
+
+/* dest = (src1 + src2 + 1) / 2 (8 pixels per row) */
+static void MC_avg_1_8_neon (uint8_t * dest, const uint8_t * src1,
+			     const uint8_t * src2,
+			     const int stride, unsigned height)
+{
+    do {
+	asm volatile (
+	    "vld1.u8 {d0}, [%[src1]]\n"
+	    "vld1.u8 {d1}, [%[src2]]\n"
+	    "vrhadd.u8 d0, d0, d1\n"
+	    "vst1.u8 {d0}, [%[dest]]\n"
+	    :
+	    : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
+	    : "memory", "q0");
+	src1 += stride;
+	src2 += stride;
+	dest += stride;
+    } while (--height);
+}
+
+/* dest = (dest + ((src1 + src2 + 1) / 2) + 1) / 2 (16 pixels per row) */
+static void MC_avg_2_16_neon (uint8_t * dest, const uint8_t * src1,
+			      const uint8_t * src2,
+			      const int stride, unsigned height)
+{
+    do {
+	asm volatile (
+	    "vld1.u8 {q0}, [%[src1]]\n"
+	    "vld1.u8 {q1}, [%[src2]]\n"
+	    "vrhadd.u8 q0, q0, q1\n"
+	    "vld1.u8 {q2}, [%[dest]]\n"
+	    /* XXX: one cycle stall */
+	    "vrhadd.u8 q0, q0, q2\n"
+	    /* XXX: three cycles stall */
+	    "vst1.u8 {q0}, [%[dest]]\n"
+	    :
+	    : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
+	    : "memory", "q0", "q1", "q2");
+	src1 += stride;
+	src2 += stride;
+	dest += stride;
+    } while (--height);
+}
+
+/* dest = (dest + ((src1 + src2 + 1) / 2) + 1) / 2 (8 pixels per row) */
+static void MC_avg_2_8_neon (uint8_t * dest, const uint8_t * src1,
+			     const uint8_t * src2,
+			     const int stride, unsigned height)
+{
+    do {
+	asm volatile (
+	    "vld1.u8 {d0}, [%[src1]]\n"
+	    "vld1.u8 {d1}, [%[src2]]\n"
+	    "vrhadd.u8 d0, d0, d1\n"
+	    "vld1.u8 {d2}, [%[dest]]\n"
+	    "vrhadd.u8 d0, d0, d2\n"
+	    "vst1.u8 {d0}, [%[dest]]\n"
+	    :
+	    : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
+	    : "memory", "q0", "d2");
+	src1 += stride;
+	src2 += stride;
+	dest += stride;
+    } while (--height);
+}
+
+/* dest = (dest + ref + 1) / 2 (16 pixels per row) */
+static void MC_avg_o_16_neon (uint8_t * dest, const uint8_t * ref,
+			      const int stride, int height)
+{
+    MC_avg_1_16_neon (dest, dest, ref, stride, height);
+}
+
+/* dest = (dest + ref + 1) / 2 (8 pixels per row) */
+static void MC_avg_o_8_neon (uint8_t * dest, const uint8_t * ref,
+			     const int stride, int height)
+{
+    MC_avg_1_8_neon (dest, dest, ref, stride, height);
+}
+
+/* dest = (ref[x] + ref[x + 1] + 1) / 2 (16 pixels per row) */
+static void MC_put_x_16_neon (uint8_t * dest, const uint8_t * ref,
+			      const int stride, int height)
+{
+    MC_avg_1_16_neon (dest, ref, ref + 1, stride, height);
+}
+
+/* dest = (ref[x] + ref[x + 1] + 1) / 2 (8 pixels per row) */
+static void MC_put_x_8_neon (uint8_t * dest, const uint8_t * ref,
+			     const int stride, int height)
+{
+    MC_avg_1_8_neon (dest, ref, ref + 1, stride, height);
+}
+
+/* dest = (dest + (ref[x] + ref[x + 1] + 1) / 2 + 1) / 2 (16 pixels per row) */
+static void MC_avg_x_16_neon (uint8_t * dest, const uint8_t * ref,
+			      const int stride, int height)
+{
+    MC_avg_2_16_neon (dest, ref, ref + 1, stride, height);
+}
+
+/* dest = (dest + (ref[x] + ref[x + 1] + 1) / 2 + 1) / 2 (8 pixels per row) */
+static void MC_avg_x_8_neon (uint8_t * dest, const uint8_t * ref,
+			     const int stride, int height)
+{
+    MC_avg_2_8_neon (dest, ref, ref + 1, stride, height);
+}
+
+/* dest = (ref[x] + ref[x + stride] + 1) / 2 (16 pixels per row) */
+static void MC_put_y_16_neon (uint8_t * dest, const uint8_t * ref,
+			      const int stride, int height)
+{
+    MC_avg_1_16_neon (dest, ref, ref + stride, stride, height);
+}
+
+/* dest = (ref[x] + ref[x + stride] + 1) / 2 (8 pixels per row) */
+static void MC_put_y_8_neon (uint8_t * dest, const uint8_t * ref,
+			     const int stride, int height)
+{
+    MC_avg_1_8_neon (dest, ref, ref + stride, stride, height);
+}
+
+/* dest = (dest + (ref[x] + ref[x + stride] + 1) / 2 + 1) / 2 (16 per row) */
+static void MC_avg_y_16_neon (uint8_t * dest, const uint8_t * ref,
+			      const int stride, int height)
+{
+    MC_avg_2_16_neon (dest, ref, ref + stride, stride, height);
+}
+
+/* dest = (dest + (ref[x] + ref[x + stride] + 1) / 2 + 1) / 2 (8 per row) */
+static void MC_avg_y_8_neon (uint8_t * dest, const uint8_t * ref,
+			     const int stride, int height)
+{
+    MC_avg_2_8_neon (dest, ref, ref + stride, stride, height);
+}
+
+/*
+ * dest = (ref[x] + ref[x+1] + ref[x+stride] + ref[x+stride+1] + 2) / 4
+ *
+ * The four neighbours are summed in 16-bit lanes (at most 4 * 255, so no
+ * overflow) and rounded once by vrshrn #2.  Chaining three vrhadd.u8 would
+ * round twice and can be off by one: for samples 1,0,0,0 it yields 1 where
+ * the exact rounded average is 0.
+ */
+static void MC_put_xy_16_neon (uint8_t * dest, const uint8_t * ref,
+			       const int stride, int height)
+{
+    do {
+	asm volatile (
+	    "vld1.u8 {q0}, [%[ref]]\n"
+	    "vld1.u8 {q1}, [%[refx]]\n"
+	    "vld1.u8 {q2}, [%[refy]]\n"
+	    "vld1.u8 {q3}, [%[refxy]]\n"
+	    "vaddl.u8 q8, d0, d2\n"
+	    "vaddl.u8 q9, d1, d3\n"
+	    "vaddl.u8 q10, d4, d6\n"
+	    "vaddl.u8 q11, d5, d7\n"
+	    "vadd.i16 q8, q8, q10\n"
+	    "vadd.i16 q9, q9, q11\n"
+	    "vrshrn.i16 d0, q8, #2\n"
+	    "vrshrn.i16 d1, q9, #2\n"
+	    "vst1.u8 {q0}, [%[dest]]\n"
+	    :
+	    : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
+		       [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
+	    : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+	ref += stride;
+	dest += stride;
+    } while (--height);
+}
+
+/* Exact four-sample average as in MC_put_xy_16_neon, for rows of 8 pixels. */
+static void MC_put_xy_8_neon (uint8_t * dest, const uint8_t * ref,
+			      const int stride, int height)
+{
+    do {
+	asm volatile (
+	    "vld1.u8 {d0}, [%[ref]]\n"
+	    "vld1.u8 {d1}, [%[refx]]\n"
+	    "vld1.u8 {d2}, [%[refy]]\n"
+	    "vld1.u8 {d3}, [%[refxy]]\n"
+	    "vaddl.u8 q8, d0, d1\n"
+	    "vaddl.u8 q9, d2, d3\n"
+	    "vadd.i16 q8, q8, q9\n"
+	    "vrshrn.i16 d0, q8, #2\n"
+	    "vst1.u8 {d0}, [%[dest]]\n"
+	    :
+	    : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
+		       [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
+	    : "memory", "q0", "q1", "q8", "q9");
+	ref += stride;
+	dest += stride;
+    } while (--height);
+}
+
+/*
+ * dest = (dest + xy + 1) / 2, where xy is the exact four-sample average
+ * (ref[x] + ref[x+1] + ref[x+stride] + ref[x+stride+1] + 2) / 4.
+ */
+static void MC_avg_xy_16_neon (uint8_t * dest, const uint8_t * ref,
+			       const int stride, int height)
+{
+    do {
+	asm volatile (
+	    "vld1.u8 {q0}, [%[ref]]\n"
+	    "vld1.u8 {q1}, [%[refx]]\n"
+	    "vld1.u8 {q2}, [%[refy]]\n"
+	    "vld1.u8 {q3}, [%[refxy]]\n"
+	    "vld1.u8 {q12}, [%[dest]]\n"
+	    "vaddl.u8 q8, d0, d2\n"
+	    "vaddl.u8 q9, d1, d3\n"
+	    "vaddl.u8 q10, d4, d6\n"
+	    "vaddl.u8 q11, d5, d7\n"
+	    "vadd.i16 q8, q8, q10\n"
+	    "vadd.i16 q9, q9, q11\n"
+	    "vrshrn.i16 d0, q8, #2\n"
+	    "vrshrn.i16 d1, q9, #2\n"
+	    "vrhadd.u8 q0, q12, q0\n"
+	    "vst1.u8 {q0}, [%[dest]]\n"
+	    :
+	    : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
+		       [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
+	    : "memory", "q0", "q1", "q2", "q3",
+	      "q8", "q9", "q10", "q11", "q12");
+	ref += stride;
+	dest += stride;
+    } while (--height);
+}
+
+/* Same computation as MC_avg_xy_16_neon, for rows of 8 pixels. */
+static void MC_avg_xy_8_neon (uint8_t * dest, const uint8_t * ref,
+			      const int stride, int height)
+{
+    do {
+	asm volatile (
+	    "vld1.u8 {d0}, [%[ref]]\n"
+	    "vld1.u8 {d1}, [%[refx]]\n"
+	    "vld1.u8 {d2}, [%[refy]]\n"
+	    "vld1.u8 {d3}, [%[refxy]]\n"
+	    "vld1.u8 {d4}, [%[dest]]\n"
+	    "vaddl.u8 q8, d0, d1\n"
+	    "vaddl.u8 q9, d2, d3\n"
+	    "vadd.i16 q8, q8, q9\n"
+	    "vrshrn.i16 d0, q8, #2\n"
+	    "vrhadd.u8 d0, d4, d0\n"
+	    "vst1.u8 {d0}, [%[dest]]\n"
+	    :
+	    : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
+		       [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
+	    : "memory", "q0", "q1", "d4", "q8", "q9");
+	ref += stride;
+	dest += stride;
+    } while (--height);
+}
+
+MPEG2_MC_EXTERN (neon)
+
+#endif /* ARCH_ARM_NEON */

Modification de propriétés sur libmpeg2/motion_comp_neon.c
___________________________________________________________________
Ajouté : svn:eol-style
   + native

Index: libmpeg2/mpeg2_internal.h
===================================================================
--- libmpeg2/mpeg2_internal.h	(révision 1204)
+++ libmpeg2/mpeg2_internal.h	(copie de travail)
@@ -330,5 +330,6 @@
 extern mpeg2_mc_t mpeg2_mc_alpha;
 extern mpeg2_mc_t mpeg2_mc_vis;
 extern mpeg2_mc_t mpeg2_mc_arm;
+extern mpeg2_mc_t mpeg2_mc_neon;
 
 #endif /* LIBMPEG2_MPEG2_INTERNAL_H */
Index: libmpeg2/motion_comp.c
===================================================================
--- libmpeg2/motion_comp.c	(révision 1204)
+++ libmpeg2/motion_comp.c	(copie de travail)
@@ -58,6 +58,11 @@
     else
 #endif
 #ifdef ARCH_ARM
+#ifdef ARCH_ARM_NEON
+    if (accel & MPEG2_ACCEL_ARM_NEON)
+        mpeg2_mc = mpeg2_mc_neon;
+    else
+#endif
     if (accel & MPEG2_ACCEL_ARM)
 	mpeg2_mc = mpeg2_mc_arm;
     else
Index: libmpeg2/Makefile.am
===================================================================
--- libmpeg2/Makefile.am	(révision 1204)
+++ libmpeg2/Makefile.am	(copie de travail)
@@ -14,7 +14,7 @@
 			  motion_comp_vis.c motion_comp_arm.c \
 			  cpu_accel.c cpu_state.c
 if ARCH_ARM
-libmpeg2arch_la_SOURCES += motion_comp_arm_s.S
+libmpeg2arch_la_SOURCES += motion_comp_arm_s.S motion_comp_neon.c
 endif
 libmpeg2arch_la_CFLAGS = $(OPT_CFLAGS) $(ARCH_OPT_CFLAGS) $(LIBMPEG2_CFLAGS)
 
Index: configure.ac
===================================================================
--- configure.ac	(révision 1204)
+++ configure.ac	(copie de travail)
@@ -103,7 +103,14 @@
 	AC_DEFINE([ARCH_ALPHA],,[alpha architecture]);;
     arm*)
 	arm_conditional=:
-	AC_DEFINE([ARCH_ARM],,[ARM architecture]);;
+	AC_DEFINE([ARCH_ARM],,[ARM architecture])
+        AC_MSG_CHECKING([if inline ARM Advanced SIMD assembly is supported])
+        AC_TRY_COMPILE([],
+           [asm ("vqmovun.s64 d0, q1":::"d0");],
+           [AC_DEFINE([ARCH_ARM_NEON],, [ARM Advanced SIMD assembly])
+            AC_MSG_RESULT(yes)],
+           [AC_MSG_RESULT(no)])
+        ;;
     esac
 elif test x"$CC" = x"tendracc"; then
     dnl TenDRA portability checking compiler

------------------------------------------------------------------------------
Come build with us! The BlackBerry® Developer Conference in SF, CA
is the only developer event you need to attend this year. Jumpstart your
developing skills, take BlackBerry mobile applications to market and stay
ahead of the curve. Join us from November 9-12, 2009. Register now!
http://p.sf.net/sfu/devconf

_______________________________________________
Libmpeg2-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/libmpeg2-devel

[mpeg2-dev] [RFC] [PATCH] ARM Advanced SIMD motion compensation

Reply via email to