[libav-devel] [PATCH 113/132] dsputil: Move pix_sum, pix_norm1, shrink function pointers to mpegvideoenc

Diego Biurrun Wed, 02 Jul 2014 11:52:32 -0700

---

Could be folded into the previous commit ..


 libavcodec/arm/Makefile                            |   2 +
 libavcodec/arm/dsputil_armv6.S                     |  55 ----------
 libavcodec/arm/dsputil_init_armv6.c                |   6 --
 libavcodec/arm/mpegvideoencdsp_armv6.S             |  76 ++++++++++++++
 .../mpegvideoencdsp_init_arm.c}                    |  34 +++----
 libavcodec/bfin/Makefile                           |   2 +
 libavcodec/bfin/dsputil.S                          |  91 -----------------
 libavcodec/bfin/dsputil_init.c                     |   6 --
 libavcodec/bfin/mpegvideoencdsp.S                  | 113 +++++++++++++++++++++
 .../mpegvideoencdsp_init.c}                        |  38 +++----
 libavcodec/dnxhdenc.c                              |   5 +-
 libavcodec/dsputil.c                               |  79 --------------
 libavcodec/dsputil.h                               |   6 --
 libavcodec/motion_est.c                            |   5 +-
 libavcodec/mpegvideo_enc.c                         |  31 +++---
 libavcodec/mpegvideoencdsp.c                       |  85 ++++++++++++++++
 libavcodec/mpegvideoencdsp.h                       |  12 ++-
 libavcodec/ppc/Makefile                            |   1 +
 libavcodec/ppc/dsputil_altivec.c                   |  60 -----------
 libavcodec/ppc/mpegvideoencdsp_altivec.c           | 102 +++++++++++++++++++
 libavcodec/svq1enc.c                               |   1 +
 libavcodec/x86/Makefile                            |   4 +-
 libavcodec/x86/dsputilenc.asm                      |  69 -------------
 libavcodec/x86/dsputilenc_mmx.c                    |   4 -
 libavcodec/x86/mpegvideoencdsp.asm                 |  95 +++++++++++++++++
 .../{mpegvideoencdsp.c => mpegvideoencdsp_init.c}  |  11 +-
 26 files changed, 555 insertions(+), 438 deletions(-)
 create mode 100644 libavcodec/arm/mpegvideoencdsp_armv6.S
 copy libavcodec/{mpegvideoencdsp.h => arm/mpegvideoencdsp_init_arm.c} (54%)
 create mode 100644 libavcodec/bfin/mpegvideoencdsp.S
 copy libavcodec/{mpegvideoencdsp.h => bfin/mpegvideoencdsp_init.c} (52%)
 create mode 100644 libavcodec/ppc/mpegvideoencdsp_altivec.c
 create mode 100644 libavcodec/x86/mpegvideoencdsp.asm
 rename libavcodec/x86/{mpegvideoencdsp.c => mpegvideoencdsp_init.c} (94%)

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index eba975b..7d81e99 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -20,6 +20,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += arm/idctdsp_init_arm.o        \
                                           arm/simple_idct_arm.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
+OBJS-$(CONFIG_MPEGVIDEOENC)            += arm/mpegvideoencdsp_init_arm.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)       += arm/neontest.o
 OBJS-$(CONFIG_VECTORDSP)               += arm/vectordsp_init_arm.o
 OBJS-$(CONFIG_VIDEODSP)                += arm/videodsp_init_arm.o
@@ -56,6 +57,7 @@ ARMV6-OBJS-$(CONFIG_IDCTDSP)           += arm/idctdsp_init_armv6.o      \
                                           arm/idctdsp_armv6.o           \
                                           arm/simple_idct_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
+ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC)      += arm/mpegvideoencdsp_armv6.o
 
 ARMV6-OBJS-$(CONFIG_VP8_DECODER)       += arm/vp8_armv6.o               \
                                           arm/vp8dsp_init_armv6.o       \
diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S
index b89171f..8ff0e4a 100644
--- a/libavcodec/arm/dsputil_armv6.S
+++ b/libavcodec/arm/dsputil_armv6.S
@@ -297,58 +297,3 @@ function ff_sse16_armv6, export=1
 
         pop             {r4-r9, pc}
 endfunc
-
-function ff_pix_norm1_armv6, export=1
-        push            {r4-r6, lr}
-        mov             r12, #16
-        mov             lr,  #0
-1:
-        ldm             r0,  {r2-r5}
-        uxtb16          r6,  r2
-        uxtb16          r2,  r2,  ror #8
-        smlad           lr,  r6,  r6,  lr
-        uxtb16          r6,  r3
-        smlad           lr,  r2,  r2,  lr
-        uxtb16          r3,  r3,  ror #8
-        smlad           lr,  r6,  r6,  lr
-        uxtb16          r6,  r4
-        smlad           lr,  r3,  r3,  lr
-        uxtb16          r4,  r4,  ror #8
-        smlad           lr,  r6,  r6,  lr
-        uxtb16          r6,  r5
-        smlad           lr,  r4,  r4,  lr
-        uxtb16          r5,  r5,  ror #8
-        smlad           lr,  r6,  r6,  lr
-        subs            r12, r12, #1
-        add             r0,  r0,  r1
-        smlad           lr,  r5,  r5,  lr
-        bgt             1b
-
-        mov             r0,  lr
-        pop             {r4-r6, pc}
-endfunc
-
-function ff_pix_sum_armv6, export=1
-        push            {r4-r7, lr}
-        mov             r12, #16
-        mov             r2,  #0
-        mov             r3,  #0
-        mov             lr,  #0
-        ldr             r4,  [r0]
-1:
-        subs            r12, r12, #1
-        ldr             r5,  [r0, #4]
-        usada8          r2,  r4,  lr,  r2
-        ldr             r6,  [r0, #8]
-        usada8          r3,  r5,  lr,  r3
-        ldr             r7,  [r0, #12]
-        usada8          r2,  r6,  lr,  r2
-        beq             2f
-        ldr_pre         r4,  r0,  r1
-        usada8          r3,  r7,  lr,  r3
-        bgt             1b
-2:
-        usada8          r3,  r7,  lr,  r3
-        add             r0,  r2,  r3
-        pop             {r4-r7, pc}
-endfunc
diff --git a/libavcodec/arm/dsputil_init_armv6.c b/libavcodec/arm/dsputil_init_armv6.c
index fab5e0d..85ee8e1 100644
--- a/libavcodec/arm/dsputil_init_armv6.c
+++ b/libavcodec/arm/dsputil_init_armv6.c
@@ -43,9 +43,6 @@ int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
 int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
                    int line_size, int h);
 
-int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
-int ff_pix_sum_armv6(uint8_t *pix, int line_size);
-
 av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
 {
@@ -63,7 +60,4 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx,
     c->sad[1] = ff_pix_abs8_armv6;
 
     c->sse[0] = ff_sse16_armv6;
-
-    c->pix_norm1 = ff_pix_norm1_armv6;
-    c->pix_sum   = ff_pix_sum_armv6;
 }
diff --git a/libavcodec/arm/mpegvideoencdsp_armv6.S b/libavcodec/arm/mpegvideoencdsp_armv6.S
new file mode 100644
index 0000000..99db501
--- /dev/null
+++ b/libavcodec/arm/mpegvideoencdsp_armv6.S
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <[email protected]>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_pix_norm1_armv6, export=1
+        push            {r4-r6, lr}
+        mov             r12, #16
+        mov             lr,  #0
+1:
+        ldm             r0,  {r2-r5}
+        uxtb16          r6,  r2
+        uxtb16          r2,  r2,  ror #8
+        smlad           lr,  r6,  r6,  lr
+        uxtb16          r6,  r3
+        smlad           lr,  r2,  r2,  lr
+        uxtb16          r3,  r3,  ror #8
+        smlad           lr,  r6,  r6,  lr
+        uxtb16          r6,  r4
+        smlad           lr,  r3,  r3,  lr
+        uxtb16          r4,  r4,  ror #8
+        smlad           lr,  r6,  r6,  lr
+        uxtb16          r6,  r5
+        smlad           lr,  r4,  r4,  lr
+        uxtb16          r5,  r5,  ror #8
+        smlad           lr,  r6,  r6,  lr
+        subs            r12, r12, #1
+        add             r0,  r0,  r1
+        smlad           lr,  r5,  r5,  lr
+        bgt             1b
+
+        mov             r0,  lr
+        pop             {r4-r6, pc}
+endfunc
+
+function ff_pix_sum_armv6, export=1
+        push            {r4-r7, lr}
+        mov             r12, #16
+        mov             r2,  #0
+        mov             r3,  #0
+        mov             lr,  #0
+        ldr             r4,  [r0]
+1:
+        subs            r12, r12, #1
+        ldr             r5,  [r0, #4]
+        usada8          r2,  r4,  lr,  r2
+        ldr             r6,  [r0, #8]
+        usada8          r3,  r5,  lr,  r3
+        ldr             r7,  [r0, #12]
+        usada8          r2,  r6,  lr,  r2
+        beq             2f
+        ldr_pre         r4,  r0,  r1
+        usada8          r3,  r7,  lr,  r3
+        bgt             1b
+2:
+        usada8          r3,  r7,  lr,  r3
+        add             r0,  r2,  r3
+        pop             {r4-r7, pc}
+endfunc
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/arm/mpegvideoencdsp_init_arm.c
similarity index 54%
copy from libavcodec/mpegvideoencdsp.h
copy to libavcodec/arm/mpegvideoencdsp_init_arm.c
index c321698..ab9ba3e 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/arm/mpegvideoencdsp_init_arm.c
@@ -16,27 +16,23 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_MPEGVIDEOENCDSP_H
-#define AVCODEC_MPEGVIDEOENCDSP_H
-
 #include <stdint.h>
 
-#include "avcodec.h"
-
-#define BASIS_SHIFT 16
-#define RECON_SHIFT 6
-
-typedef struct MpegvideoEncDSPContext {
-    int (*try_8x8basis)(int16_t rem[64], int16_t weight[64],
-                        int16_t basis[64], int scale);
-    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
-
-} MpegvideoEncDSPContext;
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/mpegvideoencdsp.h"
 
-void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
-                             AVCodecContext *avctx);
-void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
-                                 AVCodecContext *avctx);
+int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
+int ff_pix_sum_armv6(uint8_t *pix, int line_size);
 
+av_cold void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
+                                         AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
 
-#endif /* AVCODEC_MPEGVIDEOENCDSP_H */
+    if (have_armv6(cpu_flags)) {
+        c->pix_norm1 = ff_pix_norm1_armv6;
+        c->pix_sum   = ff_pix_sum_armv6;
+    }
+}
diff --git a/libavcodec/bfin/Makefile b/libavcodec/bfin/Makefile
index 8bfa4dd..daa58d0 100644
--- a/libavcodec/bfin/Makefile
+++ b/libavcodec/bfin/Makefile
@@ -9,5 +9,7 @@ OBJS-$(CONFIG_HPELDSP)                  += bfin/hpeldsp_init.o          \
 OBJS-$(CONFIG_IDCTDSP)                  += bfin/idctdsp_init.o          \
                                            bfin/idctdsp.o               \
                                            bfin/pixels_clamped.o
+OBJS-$(CONFIG_MPEGVIDEOENC)             += bfin/mpegvideoencdsp_init.o  \
+                                           bfin/mpegvideoencdsp.o
 OBJS-$(CONFIG_VP3DSP)                   += bfin/vp3dsp_init.o           \
                                            bfin/vp3dsp.o
diff --git a/libavcodec/bfin/dsputil.S b/libavcodec/bfin/dsputil.S
index b1c3563..d169bad 100644
--- a/libavcodec/bfin/dsputil.S
+++ b/libavcodec/bfin/dsputil.S
@@ -49,53 +49,6 @@ DEFUN(diff_pixels,mL1,
         rts;
 DEFUN_END(diff_pixels)
 
-/*
-    for (i = 0; i < 16; i++) {
-        for (j = 0; j < 16; j++) {
-          sum += pix[j];
-        }
-        pix += line_size;
-    }
-*/
-DEFUN(pix_sum,mL1,
-        (uint8_t *p, int stride)):
-        link 0;
-        [--sp] = (r7:4);
-        p0=8;
-        i0=r0;        // s1
-        i1=r0;
-        m1=r1;
-        r1=r1+r1;
-        r1+=-16;       // stride
-        m0=r1;
-        i1+=m1;
-
-        r6=0;
-
-        LSETUP(LS$PS,LE$PS) LC0=P0;
-        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
-
-LS$PS:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
-        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++]   || R2 = [I1++];
-        r6=r6+|+r5;
-        r6=r6+|+r4;
-        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++]   || R3 = [I1++];
-        r6=r6+|+r5;
-        r6=r6+|+r4;
-        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++m0] || R2 = [I1++m0];
-        r6=r6+|+r5;
-        r6=r6+|+r4;
-        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++]   || R2 = [I1++];
-        r6=r6+|+r5;
-LE$PS:  r6=r6+|+r4;
-        r0.l=r6.l+r6.h;
-        r0.h=0;
-
-        (r7:4) = [sp++];
-        unlink;
-        rts;
-DEFUN_END(pix_sum)
-
 
 DEFUN(get_pixels,mL1,
         (int16_t *restrict block, const uint8_t *pixels, int line_size)):
@@ -181,50 +134,6 @@ e$8:    DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];
         RTS;
 DEFUN_END(z_sad8x8)
 
-DEFUN(pix_norm1,mL1,
-        (uint8_t * pix, int line_size)):
-        [--SP]=(R7:4,P5:3);
-
-        // Fetch the input arguments.
-        P1 = R0;  // pix
-        P0 = R1;  // line_size
-        P5 = 16;  // loop ctr.
-        P0 -= P5;
-        M0 = P0;  // M0 = line_size-16;
-        // Now for the real work.
-        A1 = A0 = 0;
-        lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
-        I0 = P1;
-        DISALGNEXCPT || r0 = [i0++];
-
-_pix_norm1_blkfn_loopStart:
-        // following unpacks pix1[0..15] pix1+line_size[0..15]
-        DISALGNEXCPT || r1 = [i0++];
-
-        (r5, r4) = byteunpack r1:0 || r0 = [i0++];
-        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
-        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
-        (r5, r4) = byteunpack r1:0(r) || r1 = [i0++];
-        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
-        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
-        (r5, r4) = byteunpack r1:0 || r0 = [i0++M0];
-        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
-        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
-        (r5, r4) = byteunpack r1:0(r) || r0 = [i0++];
-        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
-_pix_norm1_blkfn_loopEnd:
-        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
-
-
-// Clean up at the end:
-        R2 = A0, R3 = A1;
-        R0 = R2 + R3 (S);
-
-        (R7:4,P5:3)=[SP++];
-
-        RTS;
-DEFUN_END(pix_norm1)
-
 DEFUN(sse4,mL1,
         (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
         link 0;
diff --git a/libavcodec/bfin/dsputil_init.c b/libavcodec/bfin/dsputil_init.c
index 5c33d9c..aa14a81 100644
--- a/libavcodec/bfin/dsputil_init.c
+++ b/libavcodec/bfin/dsputil_init.c
@@ -37,9 +37,6 @@ void ff_bfin_diff_pixels(int16_t *block, const uint8_t *s1, const uint8_t *s2,
 void ff_bfin_get_pixels(int16_t *restrict block, const uint8_t *pixels,
                         int line_size) attribute_l1_text;
 
-int ff_bfin_pix_norm1(uint8_t *pix, int line_size) attribute_l1_text;
-int ff_bfin_pix_sum(uint8_t *p, int stride) attribute_l1_text;
-
 int ff_bfin_z_sad8x8(uint8_t *blk1, uint8_t *blk2, int dsz,
                      int line_size, int h) attribute_l1_text;
 int ff_bfin_z_sad16x16(uint8_t *blk1, uint8_t *blk2, int dsz,
@@ -123,9 +120,6 @@ av_cold void ff_dsputil_init_bfin(DSPContext *c, AVCodecContext *avctx,
 {
     c->diff_pixels = ff_bfin_diff_pixels;
 
-    c->pix_sum   = ff_bfin_pix_sum;
-    c->pix_norm1 = ff_bfin_pix_norm1;
-
     c->sad[0] = bfin_pix_abs16;
     c->sad[1] = bfin_pix_abs8;
 
diff --git a/libavcodec/bfin/mpegvideoencdsp.S b/libavcodec/bfin/mpegvideoencdsp.S
new file mode 100644
index 0000000..83b1f18
--- /dev/null
+++ b/libavcodec/bfin/mpegvideoencdsp.S
@@ -0,0 +1,113 @@
+/*
+ * Blackfin Pixel Operations
+ * Copyright (C) 2007 Marc Hoffman <[email protected]>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/bfin/asm.h"
+
+/*
+    for (i = 0; i < 16; i++) {
+        for (j = 0; j < 16; j++) {
+          sum += pix[j];
+        }
+        pix += line_size;
+    }
+*/
+DEFUN(pix_sum,mL1,
+        (uint8_t *p, int stride)):
+        link 0;
+        [--sp] = (r7:4);
+        p0=8;
+        i0=r0;        // s1
+        i1=r0;
+        m1=r1;
+        r1=r1+r1;
+        r1+=-16;       // stride
+        m0=r1;
+        i1+=m1;
+
+        r6=0;
+
+        LSETUP(LS$PS,LE$PS) LC0=P0;
+        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
+
+LS$PS:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
+        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++]   || R2 = [I1++];
+        r6=r6+|+r5;
+        r6=r6+|+r4;
+        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++]   || R3 = [I1++];
+        r6=r6+|+r5;
+        r6=r6+|+r4;
+        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++m0] || R2 = [I1++m0];
+        r6=r6+|+r5;
+        r6=r6+|+r4;
+        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++]   || R2 = [I1++];
+        r6=r6+|+r5;
+LE$PS:  r6=r6+|+r4;
+        r0.l=r6.l+r6.h;
+        r0.h=0;
+
+        (r7:4) = [sp++];
+        unlink;
+        rts;
+DEFUN_END(pix_sum)
+
+DEFUN(pix_norm1,mL1,
+        (uint8_t * pix, int line_size)):
+        [--SP]=(R7:4,P5:3);
+
+        // Fetch the input arguments.
+        P1 = R0;  // pix
+        P0 = R1;  // line_size
+        P5 = 16;  // loop ctr.
+        P0 -= P5;
+        M0 = P0;  // M0 = line_size-16;
+        // Now for the real work.
+        A1 = A0 = 0;
+        lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
+        I0 = P1;
+        DISALGNEXCPT || r0 = [i0++];
+
+_pix_norm1_blkfn_loopStart:
+        // following unpacks pix1[0..15] pix1+line_size[0..15]
+        DISALGNEXCPT || r1 = [i0++];
+
+        (r5, r4) = byteunpack r1:0 || r0 = [i0++];
+        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
+        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
+        (r5, r4) = byteunpack r1:0(r) || r1 = [i0++];
+        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
+        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
+        (r5, r4) = byteunpack r1:0 || r0 = [i0++M0];
+        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
+        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
+        (r5, r4) = byteunpack r1:0(r) || r0 = [i0++];
+        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
+_pix_norm1_blkfn_loopEnd:
+        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
+
+
+// Clean up at the end:
+        R2 = A0, R3 = A1;
+        R0 = R2 + R3 (S);
+
+        (R7:4,P5:3)=[SP++];
+
+        RTS;
+DEFUN_END(pix_norm1)
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/bfin/mpegvideoencdsp_init.c
similarity index 52%
copy from libavcodec/mpegvideoencdsp.h
copy to libavcodec/bfin/mpegvideoencdsp_init.c
index c321698..d459ada 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/bfin/mpegvideoencdsp_init.c
@@ -16,27 +16,17 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef AVCODEC_MPEGVIDEOENCDSP_H
-#define AVCODEC_MPEGVIDEOENCDSP_H
-
-#include <stdint.h>
-
-#include "avcodec.h"
-
-#define BASIS_SHIFT 16
-#define RECON_SHIFT 6
-
-typedef struct MpegvideoEncDSPContext {
-    int (*try_8x8basis)(int16_t rem[64], int16_t weight[64],
-                        int16_t basis[64], int scale);
-    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
-
-} MpegvideoEncDSPContext;
-
-void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
-                             AVCodecContext *avctx);
-void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
-                                 AVCodecContext *avctx);
-
-
-#endif /* AVCODEC_MPEGVIDEOENCDSP_H */
+#include "libavutil/attributes.h"
+#include "libavutil/bfin/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/mpegvideoencdsp.h"
+
+int ff_bfin_pix_norm1(uint8_t *pix, int line_size) attribute_l1_text;
+int ff_bfin_pix_sum(uint8_t *p, int stride) attribute_l1_text;
+
+av_cold void ff_mpegvideoencdsp_init_bfin(MpegvideoEncDSPContext *c,
+                                          AVCodecContext *avctx)
+{
+    c->pix_norm1 = ff_bfin_pix_norm1;
+    c->pix_sum   = ff_bfin_pix_sum;
+}
diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c
index b5b7f55..0e46ac0 100644
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -266,6 +266,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx)
     ff_blockdsp_init(&ctx->bdsp, avctx);
     ff_dsputil_init(&ctx->m.dsp, avctx);
     ff_idctdsp_init(&ctx->m.idsp, avctx);
+    ff_mpegvideoencdsp_init(&ctx->m.mvedsp, avctx);
     ff_dct_common_init(&ctx->m);
     if (!ctx->m.dct_quantize)
         ctx->m.dct_quantize = ff_dct_quantize_c;
@@ -636,8 +637,8 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int
             int varc;
 
             if (!partial_last_row && mb_x * 16 <= avctx->width - 16) {
-                sum  = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
-                varc = ctx->m.dsp.pix_norm1(pix, ctx->m.linesize);
+                sum  = ctx->m.mvedsp.pix_sum(pix, ctx->m.linesize);
+                varc = ctx->m.mvedsp.pix_norm1(pix, ctx->m.linesize);
             } else {
                 int bw = FFMIN(avctx->width - 16 * mb_x, 16);
                 int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16);
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 1fd9013..81dce9f 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -26,15 +26,12 @@
  */
 
 #include "libavutil/attributes.h"
-#include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "copy_block.h"
 #include "dct.h"
 #include "dsputil.h"
 #include "simple_idct.h"
 #include "faandct.h"
-#include "imgconvert.h"
-#include "mathops.h"
 #include "mpegvideo.h"
 #include "config.h"
 
@@ -47,74 +44,6 @@ uint32_t ff_square_tab[512] = { 0, };
 #define BIT_DEPTH 8
 #include "dsputilenc_template.c"
 
-static int pix_sum_c(uint8_t *pix, int line_size)
-{
-    int s = 0, i, j;
-
-    for (i = 0; i < 16; i++) {
-        for (j = 0; j < 16; j += 8) {
-            s   += pix[0];
-            s   += pix[1];
-            s   += pix[2];
-            s   += pix[3];
-            s   += pix[4];
-            s   += pix[5];
-            s   += pix[6];
-            s   += pix[7];
-            pix += 8;
-        }
-        pix += line_size - 16;
-    }
-    return s;
-}
-
-static int pix_norm1_c(uint8_t *pix, int line_size)
-{
-    int s = 0, i, j;
-    uint32_t *sq = ff_square_tab + 256;
-
-    for (i = 0; i < 16; i++) {
-        for (j = 0; j < 16; j += 8) {
-#if 0
-            s += sq[pix[0]];
-            s += sq[pix[1]];
-            s += sq[pix[2]];
-            s += sq[pix[3]];
-            s += sq[pix[4]];
-            s += sq[pix[5]];
-            s += sq[pix[6]];
-            s += sq[pix[7]];
-#else
-#if HAVE_FAST_64BIT
-            register uint64_t x = *(uint64_t *) pix;
-            s += sq[x         & 0xff];
-            s += sq[(x >>  8) & 0xff];
-            s += sq[(x >> 16) & 0xff];
-            s += sq[(x >> 24) & 0xff];
-            s += sq[(x >> 32) & 0xff];
-            s += sq[(x >> 40) & 0xff];
-            s += sq[(x >> 48) & 0xff];
-            s += sq[(x >> 56) & 0xff];
-#else
-            register uint32_t x = *(uint32_t *) pix;
-            s += sq[x         & 0xff];
-            s += sq[(x >>  8) & 0xff];
-            s += sq[(x >> 16) & 0xff];
-            s += sq[(x >> 24) & 0xff];
-            x  = *(uint32_t *) (pix + 4);
-            s += sq[x         & 0xff];
-            s += sq[(x >>  8) & 0xff];
-            s += sq[(x >> 16) & 0xff];
-            s += sq[(x >> 24) & 0xff];
-#endif
-#endif
-            pix += 8;
-        }
-        pix += line_size - 16;
-    }
-    return s;
-}
-
 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   int line_size, int h)
 {
@@ -1061,9 +990,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
 
     c->sum_abs_dctelem = sum_abs_dctelem_c;
 
-    c->pix_sum   = pix_sum_c;
-    c->pix_norm1 = pix_norm1_c;
-
     /* TODO [0] 16  [1] 8 */
     c->pix_abs[0][0] = pix_abs16_c;
     c->pix_abs[0][1] = pix_abs16_x2_c;
@@ -1103,11 +1029,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
     c->nsse[0] = nsse16_c;
     c->nsse[1] = nsse8_c;
 
-    c->shrink[0] = av_image_copy_plane;
-    c->shrink[1] = ff_shrink22;
-    c->shrink[2] = ff_shrink44;
-    c->shrink[3] = ff_shrink88;
-
     c->draw_edges = draw_edges_8_c;
 
     switch (avctx->bits_per_raw_sample) {
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 27a9031..b4f6834 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -58,9 +58,6 @@ typedef struct DSPContext {
                         int stride);
     int (*sum_abs_dctelem)(int16_t *block /* align 16 */);
 
-    int (*pix_sum)(uint8_t *pix, int line_size);
-    int (*pix_norm1)(uint8_t *pix, int line_size);
-
     me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
     me_cmp_func sse[6];
     me_cmp_func hadamard8_diff[6];
@@ -92,9 +89,6 @@ typedef struct DSPContext {
 #define EDGE_WIDTH 16
 #define EDGE_TOP    1
 #define EDGE_BOTTOM 2
-
-    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src,
-                      int src_wrap, int width, int height);
 } DSPContext;
 
 void ff_dsputil_init(DSPContext *p, AVCodecContext *avctx);
diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c
index 2f4655a..7e6aed1 100644
--- a/libavcodec/motion_est.c
+++ b/libavcodec/motion_est.c
@@ -880,8 +880,9 @@ void ff_estimate_p_frame_motion(MpegEncContext * s,
 
     /* intra / predictive decision */
     pix = c->src[0][0];
-    sum = s->dsp.pix_sum(pix, s->linesize);
-    varc = s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500;
+    sum  = s->mvedsp.pix_sum(pix, s->linesize);
+    varc = s->mvedsp.pix_norm1(pix, s->linesize) -
+           (((unsigned) sum * sum) >> 8) + 500;
 
     pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
     pic->mb_var [s->mb_stride * mb_y + mb_x] = (varc+128)>>8;
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index e2e0fc9..4b935be 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -869,7 +869,7 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src,
             int offset = x + y * stride;
             int sad  = s->dsp.sad[0](NULL, src + offset, ref + offset, stride,
                                      16);
-            int mean = (s->dsp.pix_sum(src + offset, stride) + 128) >> 8;
+            int mean = (s->mvedsp.pix_sum(src + offset, stride) + 128) >> 8;
             int sae  = get_sae(src + offset, mean, stride);
 
             acc += sae + 500 < sad;
@@ -1113,15 +1113,21 @@ static int estimate_best_b_count(MpegEncContext *s)
                 pre_input.f.data[2] += INPLACE_OFFSET;
             }
 
-            s->dsp.shrink[scale](s->tmp_frames[i]->data[0], s->tmp_frames[i]->linesize[0],
-                                 pre_input.f.data[0], pre_input.f.linesize[0],
-                                 c->width,      c->height);
-            s->dsp.shrink[scale](s->tmp_frames[i]->data[1], s->tmp_frames[i]->linesize[1],
-                                 pre_input.f.data[1], pre_input.f.linesize[1],
-                                 c->width >> 1, c->height >> 1);
-            s->dsp.shrink[scale](s->tmp_frames[i]->data[2], s->tmp_frames[i]->linesize[2],
-                                 pre_input.f.data[2], pre_input.f.linesize[2],
-                                 c->width >> 1, c->height >> 1);
+            s->mvedsp.shrink[scale](s->tmp_frames[i]->data[0],
+                                    s->tmp_frames[i]->linesize[0],
+                                    pre_input.f.data[0],
+                                    pre_input.f.linesize[0],
+                                    c->width, c->height);
+            s->mvedsp.shrink[scale](s->tmp_frames[i]->data[1],
+                                    s->tmp_frames[i]->linesize[1],
+                                    pre_input.f.data[1],
+                                    pre_input.f.linesize[1],
+                                    c->width >> 1, c->height >> 1);
+            s->mvedsp.shrink[scale](s->tmp_frames[i]->data[2],
+                                    s->tmp_frames[i]->linesize[2],
+                                    pre_input.f.data[2],
+                                    pre_input.f.linesize[2],
+                                    c->width >> 1, c->height >> 1);
         }
     }
 
@@ -2395,9 +2401,10 @@ static int mb_var_thread(AVCodecContext *c, void *arg){
             int yy = mb_y * 16;
             uint8_t *pix = s->new_picture.f.data[0] + (yy * s->linesize) + xx;
             int varc;
-            int sum = s->dsp.pix_sum(pix, s->linesize);
+            int sum = s->mvedsp.pix_sum(pix, s->linesize);
 
-            varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500 + 128)>>8;
+            varc = (s->mvedsp.pix_norm1(pix, s->linesize) -
+                    (((unsigned) sum * sum) >> 8) + 500 + 128) >> 8;
 
             s->current_picture.mb_var [s->mb_stride * mb_y + mb_x] = varc;
             s->current_picture.mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8;
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index 16ff1f2..d827186 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -21,7 +21,10 @@
 
 #include "config.h"
 #include "libavutil/attributes.h"
+#include "libavutil/imgutils.h"
 #include "avcodec.h"
+#include "dsputil.h"
+#include "imgconvert.h"
 #include "mpegvideoencdsp.h"
 
 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
@@ -53,12 +56,94 @@ static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
                   (BASIS_SHIFT - RECON_SHIFT);
 }
 
+static int pix_sum_c(uint8_t *pix, int line_size) /* sum of the bytes of a 16x16 block; line_size is the row stride */
+{
+    int s = 0, i, j;
+
+    for (i = 0; i < 16; i++) {          /* 16 rows */
+        for (j = 0; j < 16; j += 8) {   /* two unrolled passes of 8 pixels per row */
+            s   += pix[0];
+            s   += pix[1];
+            s   += pix[2];
+            s   += pix[3];
+            s   += pix[4];
+            s   += pix[5];
+            s   += pix[6];
+            s   += pix[7];
+            pix += 8;
+        }
+        pix += line_size - 16;          /* 16 bytes of this row were consumed; step to the next row */
+    }
+    return s;
+}
+
+static int pix_norm1_c(uint8_t *pix, int line_size) /* accumulates sq[] lookups of every byte of a 16x16 block */
+{
+    int s = 0, i, j;
+    uint32_t *sq = ff_square_tab + 256; /* presumably sq[v] == v * v; the table is defined elsewhere -- TODO confirm */
+
+    for (i = 0; i < 16; i++) {          /* 16 rows */
+        for (j = 0; j < 16; j += 8) {   /* two passes of 8 pixels per row */
+#if 0 /* disabled byte-at-a-time variant */
+            s += sq[pix[0]];
+            s += sq[pix[1]];
+            s += sq[pix[2]];
+            s += sq[pix[3]];
+            s += sq[pix[4]];
+            s += sq[pix[5]];
+            s += sq[pix[6]];
+            s += sq[pix[7]];
+#else
+#if HAVE_FAST_64BIT
+            register uint64_t x = *(uint64_t *) pix; /* 8 pixels in one load; NOTE(review): relies on this type-punned, possibly unaligned read being safe on the target -- confirm */
+            s += sq[x         & 0xff];  /* every byte lane is looked up and summed, so byte order does not matter */
+            s += sq[(x >>  8) & 0xff];
+            s += sq[(x >> 16) & 0xff];
+            s += sq[(x >> 24) & 0xff];
+            s += sq[(x >> 32) & 0xff];
+            s += sq[(x >> 40) & 0xff];
+            s += sq[(x >> 48) & 0xff];
+            s += sq[(x >> 56) & 0xff];
+#else
+            register uint32_t x = *(uint32_t *) pix; /* first 4 pixels; same aliasing/alignment caveat as the 64-bit path */
+            s += sq[x         & 0xff];
+            s += sq[(x >>  8) & 0xff];
+            s += sq[(x >> 16) & 0xff];
+            s += sq[(x >> 24) & 0xff];
+            x  = *(uint32_t *) (pix + 4);            /* remaining 4 pixels of this group of 8 */
+            s += sq[x         & 0xff];
+            s += sq[(x >>  8) & 0xff];
+            s += sq[(x >> 16) & 0xff];
+            s += sq[(x >> 24) & 0xff];
+#endif /* HAVE_FAST_64BIT */
+#endif /* 0 */
+            pix += 8;
+        }
+        pix += line_size - 16;          /* 16 bytes of this row were consumed; step to the next row */
+    }
+    return s;
+}
+
 av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                                      AVCodecContext *avctx)
 {
     c->try_8x8basis = try_8x8basis_c;
     c->add_8x8basis = add_8x8basis_c;
 
+    c->shrink[0] = av_image_copy_plane; /* index 0: plain plane copy */
+    c->shrink[1] = ff_shrink22;         /* presumably 2x2 / 4x4 / 8x8 downscalers, going by the names -- TODO confirm */
+    c->shrink[2] = ff_shrink44;
+    c->shrink[3] = ff_shrink88;
+
+    c->pix_sum   = pix_sum_c;           /* C defaults; the per-arch init calls below may replace them */
+    c->pix_norm1 = pix_norm1_c;
+
+    if (ARCH_ARM)
+        ff_mpegvideoencdsp_init_arm(c, avctx);
+    if (ARCH_BFIN)
+        ff_mpegvideoencdsp_init_bfin(c, avctx);
+    if (ARCH_PPC)
+        ff_mpegvideoencdsp_init_ppc(c, avctx);
     if (ARCH_X86)
         ff_mpegvideoencdsp_init_x86(c, avctx);
 }
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index c321698..4e144ed 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -31,12 +31,22 @@ typedef struct MpegvideoEncDSPContext {
                         int16_t basis[64], int scale);
     void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
 
+    int (*pix_sum)(uint8_t *pix, int line_size);
+    int (*pix_norm1)(uint8_t *pix, int line_size);
+
+    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src,
+                      int src_wrap, int width, int height);
 } MpegvideoEncDSPContext;
 
 void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                              AVCodecContext *avctx);
+void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
+                                 AVCodecContext *avctx);
+void ff_mpegvideoencdsp_init_bfin(MpegvideoEncDSPContext *c,
+                                  AVCodecContext *avctx);
+void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
+                                 AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
                                  AVCodecContext *avctx);
 
-
 #endif /* AVCODEC_MPEGVIDEOENCDSP_H */
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index 952d756..607a334 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -12,6 +12,7 @@ OBJS-$(CONFIG_IDCTDSP)                 += ppc/idctdsp_altivec.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += ppc/mpegaudiodsp_altivec.o
 OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o      \
                                           ppc/mpegvideodsp_altivec.o
+OBJS-$(CONFIG_MPEGVIDEOENC)            += ppc/mpegvideoencdsp_altivec.o
 OBJS-$(CONFIG_VECTORDSP)               += ppc/vectordsp_altivec.o
 OBJS-$(CONFIG_VIDEODSP)                += ppc/videodsp_ppc.o
 OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c
index c3f90e9..efb4a18 100644
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -308,34 +308,6 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
     return s;
 }
 
-static int pix_norm1_altivec(uint8_t *pix, int line_size)
-{
-    int i, s = 0;
-    const vector unsigned int zero =
-        (const vector unsigned int) vec_splat_u32(0);
-    vector unsigned char perm = vec_lvsl(0, pix);
-    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
-    vector signed int sum;
-
-    for (i = 0; i < 16; i++) {
-        /* Read the potentially unaligned pixels. */
-        vector unsigned char pixl = vec_ld(0,  pix);
-        vector unsigned char pixr = vec_ld(15, pix);
-        vector unsigned char pixv = vec_perm(pixl, pixr, perm);
-
-        /* Square the values, and add them to our sum. */
-        sv = vec_msum(pixv, pixv, sv);
-
-        pix += line_size;
-    }
-    /* Sum up the four partial sums, and put the result into s. */
-    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
-    sum = vec_splat(sum, 3);
-    vec_ste(sum, 0, &s);
-
-    return s;
-}
-
 /* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
  * It's the sad8_altivec code above w/ squaring added. */
 static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
@@ -430,35 +402,6 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
     return s;
 }
 
-static int pix_sum_altivec(uint8_t *pix, int line_size)
-{
-    int i, s;
-    const vector unsigned int zero =
-        (const vector unsigned int) vec_splat_u32(0);
-    vector unsigned char perm = vec_lvsl(0, pix);
-    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
-    vector signed int sumdiffs;
-
-    for (i = 0; i < 16; i++) {
-        /* Read the potentially unaligned 16 pixels into t1. */
-        vector unsigned char pixl = vec_ld(0,  pix);
-        vector unsigned char pixr = vec_ld(15, pix);
-        vector unsigned char t1   = vec_perm(pixl, pixr, perm);
-
-        /* Add each 4 pixel group together and put 4 results into sad. */
-        sad = vec_sum4s(t1, sad);
-
-        pix += line_size;
-    }
-
-    /* Sum up the four partial sums, and put the result into s. */
-    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
-    sumdiffs = vec_splat(sumdiffs, 3);
-    vec_ste(sumdiffs, 0, &s);
-
-    return s;
-}
-
 static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                                int line_size)
 {
@@ -911,9 +854,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx,
     c->sse[0] = sse16_altivec;
     c->sse[1] = sse8_altivec;
 
-    c->pix_norm1 = pix_norm1_altivec;
-    c->pix_sum   = pix_sum_altivec;
-
     c->diff_pixels = diff_pixels_altivec;
 
     if (!high_bit_depth) {
diff --git a/libavcodec/ppc/mpegvideoencdsp_altivec.c b/libavcodec/ppc/mpegvideoencdsp_altivec.c
new file mode 100644
index 0000000..398f3a9
--- /dev/null
+++ b/libavcodec/ppc/mpegvideoencdsp_altivec.c
@@ -0,0 +1,102 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include <stdint.h>
+#if HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/ppc/types_altivec.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavcodec/mpegvideoencdsp.h"
+
+#if HAVE_ALTIVEC
+
+static int pix_norm1_altivec(uint8_t *pix, int line_size) /* sum of squared pixels over 16 rows of 16 bytes */
+{
+    int i, s = 0;
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    vector unsigned char perm = vec_lvsl(0, pix); /* computed once from the first row; NOTE(review): reused for all rows, so presumably every row has the same alignment -- confirm */
+    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0); /* four running partial sums */
+    vector signed int sum;
+
+    for (i = 0; i < 16; i++) {
+        /* Read the potentially unaligned pixels. */
+        vector unsigned char pixl = vec_ld(0,  pix);
+        vector unsigned char pixr = vec_ld(15, pix);
+        vector unsigned char pixv = vec_perm(pixl, pixr, perm); /* the 16 row pixels, extracted from the two loads */
+
+        /* Square the values, and add them to our sum. */
+        sv = vec_msum(pixv, pixv, sv);
+
+        pix += line_size; /* next row */
+    }
+    /* Sum up the four partial sums, and put the result into s. */
+    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
+    sum = vec_splat(sum, 3); /* replicate element 3 into every element before the single-element store */
+    vec_ste(sum, 0, &s);
+
+    return s;
+}
+
+static int pix_sum_altivec(uint8_t *pix, int line_size) /* sum of pixels over 16 rows of 16 bytes */
+{
+    int i, s; /* s is only written by the vec_ste() at the end */
+    const vector unsigned int zero =
+        (const vector unsigned int) vec_splat_u32(0);
+    vector unsigned char perm = vec_lvsl(0, pix); /* computed once from the first row; NOTE(review): reused for all rows, so presumably every row has the same alignment -- confirm */
+    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); /* four running partial sums (a plain sum despite the name) */
+    vector signed int sumdiffs;
+
+    for (i = 0; i < 16; i++) {
+        /* Read the potentially unaligned 16 pixels into t1. */
+        vector unsigned char pixl = vec_ld(0,  pix);
+        vector unsigned char pixr = vec_ld(15, pix);
+        vector unsigned char t1   = vec_perm(pixl, pixr, perm);
+
+        /* Add each 4 pixel group together and put 4 results into sad. */
+        sad = vec_sum4s(t1, sad);
+
+        pix += line_size; /* next row */
+    }
+
+    /* Sum up the four partial sums, and put the result into s. */
+    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
+    sumdiffs = vec_splat(sumdiffs, 3); /* replicate element 3 into every element before the single-element store */
+    vec_ste(sumdiffs, 0, &s);
+
+    return s;
+}
+
+#endif /* HAVE_ALTIVEC */
+
+av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
+                                         AVCodecContext *avctx) /* avctx is not used in this function */
+{
+#if HAVE_ALTIVEC
+    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) /* built with AltiVec but the running CPU lacks it: leave c untouched */
+        return;
+
+    c->pix_norm1 = pix_norm1_altivec; /* replace whatever the caller installed with the AltiVec versions */
+    c->pix_sum   = pix_sum_altivec;
+#endif /* HAVE_ALTIVEC */
+}
diff --git a/libavcodec/svq1enc.c b/libavcodec/svq1enc.c
index fd1ff27..68e1641 100644
--- a/libavcodec/svq1enc.c
+++ b/libavcodec/svq1enc.c
@@ -506,6 +506,7 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
 
     ff_dsputil_init(&s->dsp, avctx);
     ff_hpeldsp_init(&s->hdsp, avctx->flags);
+    ff_mpegvideoencdsp_init(&s->m.mvedsp, avctx);
 
     avctx->coded_frame = av_frame_alloc();
     s->current_picture = av_frame_alloc();
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 56a4e56..54e25d6 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -23,7 +23,7 @@ OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
                                           x86/mpegvideodsp.o
 OBJS-$(CONFIG_MPEGVIDEOENC)            += x86/mpegvideoenc.o           \
-                                          x86/mpegvideoencdsp.o
+                                          x86/mpegvideoencdsp_init.o
 OBJS-$(CONFIG_QPELDSP)                 += x86/qpeldsp_init.o
 OBJS-$(CONFIG_VECTORDSP)               += x86/vectordsp_init.o
 OBJS-$(CONFIG_VIDEODSP)                += x86/videodsp_init.o
@@ -90,6 +90,8 @@ YASM-OBJS-$(CONFIG_HPELDSP)            += x86/fpel.o                    \
                                           x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_HUFFYUVDSP)         += x86/huffyuvdsp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
+YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoenc.o            \
+                                          x86/mpegvideoencdsp.o
 YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                 \
                                           x86/fpel.o                    \
                                           x86/qpel.o
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index f676ba6..d996962 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -418,72 +418,3 @@ cglobal diff_pixels, 4,5
     add          r4, 16
     jne .loop
     REP_RET
-
-INIT_MMX mmx
-; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
-    movsxdifnidn r1, r1d
-    mov          r2, r1
-    neg          r2
-    shl          r2, 4
-    sub          r0, r2
-    pxor         m7, m7
-    pxor         m6, m6
-.loop:
-    mova         m0, [r0+r2+0]
-    mova         m1, [r0+r2+0]
-    mova         m2, [r0+r2+8]
-    mova         m3, [r0+r2+8]
-    punpcklbw    m0, m7
-    punpckhbw    m1, m7
-    punpcklbw    m2, m7
-    punpckhbw    m3, m7
-    paddw        m1, m0
-    paddw        m3, m2
-    paddw        m3, m1
-    paddw        m6, m3
-    add          r2, r1
-    js .loop
-    mova         m5, m6
-    psrlq        m6, 32
-    paddw        m6, m5
-    mova         m5, m6
-    psrlq        m6, 16
-    paddw        m6, m5
-    movd        eax, m6
-    and         eax, 0xffff
-    RET
-
-INIT_MMX mmx
-; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
-    movsxdifnidn r1, r1d
-    mov          r2, 16
-    pxor         m0, m0
-    pxor         m7, m7
-.loop:
-    mova         m2, [r0+0]
-    mova         m3, [r0+8]
-    mova         m1, m2
-    punpckhbw    m1, m0
-    punpcklbw    m2, m0
-    mova         m4, m3
-    punpckhbw    m3, m0
-    punpcklbw    m4, m0
-    pmaddwd      m1, m1
-    pmaddwd      m2, m2
-    pmaddwd      m3, m3
-    pmaddwd      m4, m4
-    paddd        m2, m1
-    paddd        m4, m3
-    paddd        m7, m2
-    add          r0, r1
-    paddd        m7, m4
-    dec r2
-    jne .loop
-    mova         m1, m7
-    psrlq        m7, 32
-    paddd        m1, m7
-    movd        eax, m1
-    RET
-
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 7af0913..563543e 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -35,8 +35,6 @@ void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
 void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         int stride);
-int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
-int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
 
 #if HAVE_INLINE_ASM
 
@@ -831,8 +829,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
         if (!high_bit_depth)
             c->get_pixels = ff_get_pixels_mmx;
         c->diff_pixels = ff_diff_pixels_mmx;
-        c->pix_sum     = ff_pix_sum16_mmx;
-        c->pix_norm1   = ff_pix_norm1_mmx;
     }
 
     if (EXTERNAL_SSE2(cpu_flags))
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
new file mode 100644
index 0000000..9326ee7
--- /dev/null
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -0,0 +1,95 @@
+;*****************************************************************************
+;* SIMD-optimized MPEG encoding functions
+;*****************************************************************************
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <[email protected]>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_MMX mmx
+; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
+cglobal pix_sum16, 2, 3                 ; r0 = pix, r1 = line_size (per the prototype above), r2 = scratch
+    movsxdifnidn r1, r1d                ; x86inc helper; presumably widens the int line_size to register width -- confirm
+    mov          r2, r1
+    neg          r2
+    shl          r2, 4                  ; r2 = -16 * line_size
+    sub          r0, r2                 ; r0 = pix + 16 * line_size, so [r0+r2] is the first row
+    pxor         m7, m7                 ; m7 = 0, zero source for the byte->word unpacks
+    pxor         m6, m6                 ; m6 = four 16-bit running sums
+.loop:
+    mova         m0, [r0+r2+0]          ; bytes 0-7 of the current row (loaded twice: low/high halves)
+    mova         m1, [r0+r2+0]
+    mova         m2, [r0+r2+8]          ; bytes 8-15 of the current row
+    mova         m3, [r0+r2+8]
+    punpcklbw    m0, m7                 ; bytes 0-3   -> words
+    punpckhbw    m1, m7                 ; bytes 4-7   -> words
+    punpcklbw    m2, m7                 ; bytes 8-11  -> words
+    punpckhbw    m3, m7                 ; bytes 12-15 -> words
+    paddw        m1, m0
+    paddw        m3, m2
+    paddw        m3, m1                 ; m3 = four word sums covering all 16 bytes of the row
+    paddw        m6, m3
+    add          r2, r1                 ; next row; r2 stays negative until 16 rows are done (for positive line_size)
+    js .loop
+    mova         m5, m6
+    psrlq        m6, 32
+    paddw        m6, m5                 ; fold words 2,3 onto words 0,1
+    mova         m5, m6
+    psrlq        m6, 16
+    paddw        m6, m5                 ; low word now holds the sum of all four words
+    movd        eax, m6
+    and         eax, 0xffff             ; keep only the 16-bit total (256 pixels * 255 max = 65280 fits)
+    RET
+
+INIT_MMX mmx
+; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
+cglobal pix_norm1, 2, 4                 ; r0 = pix, r1 = line_size; NOTE(review): 4 GPRs requested but only r0-r2 are referenced
+    movsxdifnidn r1, r1d                ; x86inc helper; presumably widens the int line_size to register width -- confirm
+    mov          r2, 16                 ; row counter
+    pxor         m0, m0                 ; m0 = 0, zero source for the byte->word unpacks
+    pxor         m7, m7                 ; m7 = two 32-bit running sums of squares
+.loop:
+    mova         m2, [r0+0]             ; bytes 0-7 of the current row
+    mova         m3, [r0+8]             ; bytes 8-15 of the current row
+    mova         m1, m2
+    punpckhbw    m1, m0                 ; bytes 4-7   -> words
+    punpcklbw    m2, m0                 ; bytes 0-3   -> words
+    mova         m4, m3
+    punpckhbw    m3, m0                 ; bytes 12-15 -> words
+    punpcklbw    m4, m0                 ; bytes 8-11  -> words
+    pmaddwd      m1, m1                 ; square each word and add adjacent pairs -> two dwords
+    pmaddwd      m2, m2
+    pmaddwd      m3, m3
+    pmaddwd      m4, m4
+    paddd        m2, m1
+    paddd        m4, m3
+    paddd        m7, m2
+    add          r0, r1                 ; advance to the next row
+    paddd        m7, m4
+    dec r2
+    jne .loop
+    mova         m1, m7
+    psrlq        m7, 32
+    paddd        m1, m7                 ; low dword = sum of the two partial sums
+    movd        eax, m1                 ; return the 32-bit total
+    RET
+
diff --git a/libavcodec/x86/mpegvideoencdsp.c b/libavcodec/x86/mpegvideoencdsp_init.c
similarity index 94%
rename from libavcodec/x86/mpegvideoencdsp.c
rename to libavcodec/x86/mpegvideoencdsp_init.c
index db2c37f..4ef2f34 100644
--- a/libavcodec/x86/mpegvideoencdsp.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -22,6 +22,9 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 
+int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
+int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
+
 #if HAVE_INLINE_ASM
 
 #define PHADDD(a, t)                            \
@@ -95,9 +98,15 @@
 av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
                                          AVCodecContext *avctx)
 {
-#if HAVE_INLINE_ASM
     int cpu_flags = av_get_cpu_flags();
 
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->pix_sum   = ff_pix_sum16_mmx;
+        c->pix_norm1 = ff_pix_norm1_mmx;
+    }
+
+#if HAVE_INLINE_ASM
+
     if (INLINE_MMX(cpu_flags)) {
         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
             c->try_8x8basis = try_8x8basis_mmx;
-- 
1.8.3.2

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 113/132] dsputil: Move pix_sum, pix_norm1, shrink function pointers to mpegvideoenc

Reply via email to