[libav-devel] [PATCH 23/29] alpha: hpeldsp: Move half-pel assembly from dsputil to hpeldsp

Martin Storsjö Wed, 10 Apr 2013 09:25:01 -0700

From: "Ronald S. Bultje" <[email protected]>

---
 libavcodec/alpha/Makefile                          |    2 +
 libavcodec/alpha/dsputil_alpha.c                   |  185 --------------------
 libavcodec/alpha/dsputil_alpha.h                   |    2 -
 libavcodec/alpha/dsputil_alpha_asm.S               |   92 ----------
 .../alpha/{dsputil_alpha.c => hpeldsp_alpha.c}     |  136 +-------------
 .../{bfin/vp3_bfin.h => alpha/hpeldsp_alpha.h}     |   10 +-
 libavcodec/alpha/hpeldsp_alpha_asm.S               |  135 ++++++++++++++
 libavcodec/hpeldsp.c                               |    1 +
 libavcodec/hpeldsp.h                               |    1 +
 9 files changed, 147 insertions(+), 417 deletions(-)
 copy libavcodec/alpha/{dsputil_alpha.c => hpeldsp_alpha.c} (70%)
 copy libavcodec/{bfin/vp3_bfin.h => alpha/hpeldsp_alpha.h} (76%)
 create mode 100644 libavcodec/alpha/hpeldsp_alpha_asm.S


diff --git a/libavcodec/alpha/Makefile b/libavcodec/alpha/Makefile
index e28200d..6f22137 100644
--- a/libavcodec/alpha/Makefile
+++ b/libavcodec/alpha/Makefile
@@ -4,4 +4,6 @@ OBJS += alpha/dsputil_alpha.o                                   
        \
         alpha/motion_est_mvi_asm.o                                      \
         alpha/simple_idct_alpha.o                                       \
 
+OBJS-$(CONFIG_HPELDSP)                  += alpha/hpeldsp_alpha.o        \
+                                           alpha/hpeldsp_alpha_asm.o
 OBJS-$(CONFIG_MPEGVIDEO)                += alpha/mpegvideo_alpha.o
diff --git a/libavcodec/alpha/dsputil_alpha.c b/libavcodec/alpha/dsputil_alpha.c
index 34fe2eb..7a41cb8 100644
--- a/libavcodec/alpha/dsputil_alpha.c
+++ b/libavcodec/alpha/dsputil_alpha.c
@@ -119,196 +119,11 @@ static void clear_blocks_axp(int16_t *blocks) {
     } while (n);
 }
 
-static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
-{
-    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
-}
-
-static inline uint64_t avg2(uint64_t a, uint64_t b)
-{
-    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
-}
-
-#if 0
-/* The XY2 routines basically utilize this scheme, but reuse parts in
-   each iteration.  */
-static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
-{
-    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
-                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
-                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
-                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
-    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
-                    + (l2 & BYTE_VEC(0x03))
-                    + (l3 & BYTE_VEC(0x03))
-                    + (l4 & BYTE_VEC(0x03))
-                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
-    return r1 + r2;
-}
-#endif
-
-#define OP(LOAD, STORE)                         \
-    do {                                        \
-        STORE(LOAD(pixels), block);             \
-        pixels += line_size;                    \
-        block += line_size;                     \
-    } while (--h)
-
-#define OP_X2(LOAD, STORE)                                      \
-    do {                                                        \
-        uint64_t pix1, pix2;                                    \
-                                                                \
-        pix1 = LOAD(pixels);                                    \
-        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
-        STORE(AVG2(pix1, pix2), block);                         \
-        pixels += line_size;                                    \
-        block += line_size;                                     \
-    } while (--h)
-
-#define OP_Y2(LOAD, STORE)                      \
-    do {                                        \
-        uint64_t pix = LOAD(pixels);            \
-        do {                                    \
-            uint64_t next_pix;                  \
-                                                \
-            pixels += line_size;                \
-            next_pix = LOAD(pixels);            \
-            STORE(AVG2(pix, next_pix), block);  \
-            block += line_size;                 \
-            pix = next_pix;                     \
-        } while (--h);                          \
-    } while (0)
-
-#define OP_XY2(LOAD, STORE)                                                 \
-    do {                                                                    \
-        uint64_t pix1 = LOAD(pixels);                                       \
-        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
-        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
-                       + (pix2 & BYTE_VEC(0x03));                           \
-        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
-                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
-                                                                            \
-        do {                                                                \
-            uint64_t npix1, npix2;                                          \
-            uint64_t npix_l, npix_h;                                        \
-            uint64_t avg;                                                   \
-                                                                            \
-            pixels += line_size;                                            \
-            npix1 = LOAD(pixels);                                           \
-            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
-            npix_l = (npix1 & BYTE_VEC(0x03))                               \
-                   + (npix2 & BYTE_VEC(0x03));                              \
-            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
-                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
-            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
-                + pix_h + npix_h;                                           \
-            STORE(avg, block);                                              \
-                                                                            \
-            block += line_size;                                             \
-            pix_l = npix_l;                                                 \
-            pix_h = npix_h;                                                 \
-        } while (--h);                                                      \
-    } while (0)
-
-#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
-static void OPNAME ## _pixels ## SUFF ## _axp                               \
-        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
-         ptrdiff_t line_size, int h)                                        \
-{                                                                           \
-    if ((size_t) pixels & 0x7) {                                            \
-        OPKIND(uldq, STORE);                                                \
-    } else {                                                                \
-        OPKIND(ldq, STORE);                                                 \
-    }                                                                       \
-}                                                                           \
-                                                                            \
-static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
-        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
-         ptrdiff_t line_size, int h)                                        \
-{                                                                           \
-    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
-    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
-}
-
-#define PIXOP(OPNAME, STORE)                    \
-    MAKE_OP(OPNAME, ,     OP,     STORE)        \
-    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
-    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
-    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
-
-/* Rounding primitives.  */
-#define AVG2 avg2
-#define AVG4 avg4
-#define AVG4_ROUNDER BYTE_VEC(0x02)
-#define STORE(l, b) stq(l, b)
-PIXOP(put, STORE);
-
-#undef STORE
-#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-PIXOP(avg, STORE);
-
-/* Not rounding primitives.  */
-#undef AVG2
-#undef AVG4
-#undef AVG4_ROUNDER
-#undef STORE
-#define AVG2 avg2_no_rnd
-#define AVG4 avg4_no_rnd
-#define AVG4_ROUNDER BYTE_VEC(0x01)
-#define STORE(l, b) stq(l, b)
-PIXOP(put_no_rnd, STORE);
-
-#undef STORE
-#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-PIXOP(avg_no_rnd, STORE);
-
-static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
-                                 ptrdiff_t line_size, int h)
-{
-    put_pixels_axp_asm(block,     pixels,     line_size, h);
-    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
-}
-
 av_cold void ff_dsputil_init_alpha(DSPContext *c, AVCodecContext *avctx)
 {
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
 
     if (!high_bit_depth) {
-    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
-    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
-    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
-    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
-
-    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
-    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
-    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
-    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
-
-    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
-    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
-    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
-    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
-
-    c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
-    c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
-    c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
-    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
-
-    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
-    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
-    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
-    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
-
-    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
-    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
-    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
-    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
-
-    c->avg_pixels_tab[1][0] = avg_pixels_axp;
-    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
-    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
-    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
-
     c->clear_blocks = clear_blocks_axp;
     }
 
diff --git a/libavcodec/alpha/dsputil_alpha.h b/libavcodec/alpha/dsputil_alpha.h
index fcea47c..d976c18 100644
--- a/libavcodec/alpha/dsputil_alpha.h
+++ b/libavcodec/alpha/dsputil_alpha.h
@@ -26,8 +26,6 @@ void ff_simple_idct_axp(int16_t *block);
 void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block);
 void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block);
 
-void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h);
 void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
                                 int line_size);
 void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
diff --git a/libavcodec/alpha/dsputil_alpha_asm.S 
b/libavcodec/alpha/dsputil_alpha_asm.S
index 1f589aa..fe7e124 100644
--- a/libavcodec/alpha/dsputil_alpha_asm.S
+++ b/libavcodec/alpha/dsputil_alpha_asm.S
@@ -43,98 +43,6 @@
         .text
 
 /************************************************************************
- * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
- *                         int line_size, int h)
- */
-        .align 6
-        .globl put_pixels_axp_asm
-        .ent put_pixels_axp_asm
-put_pixels_axp_asm:
-        .frame sp, 0, ra
-        .prologue 0
-
-        and     a1, 7, t0
-        beq     t0, $aligned
-
-        .align 4
-$unaligned:
-        ldq_u   t0, 0(a1)
-        ldq_u   t1, 8(a1)
-        addq    a1, a2, a1
-        nop
-
-        ldq_u   t2, 0(a1)
-        ldq_u   t3, 8(a1)
-        addq    a1, a2, a1
-        nop
-
-        ldq_u   t4, 0(a1)
-        ldq_u   t5, 8(a1)
-        addq    a1, a2, a1
-        nop
-
-        ldq_u   t6, 0(a1)
-        ldq_u   t7, 8(a1)
-        extql   t0, a1, t0
-        addq    a1, a2, a1
-
-        extqh   t1, a1, t1
-        addq    a0, a2, t8
-        extql   t2, a1, t2
-        addq    t8, a2, t9
-
-        extqh   t3, a1, t3
-        addq    t9, a2, ta
-        extql   t4, a1, t4
-        or      t0, t1, t0
-
-        extqh   t5, a1, t5
-        or      t2, t3, t2
-        extql   t6, a1, t6
-        or      t4, t5, t4
-
-        extqh   t7, a1, t7
-        or      t6, t7, t6
-        stq     t0, 0(a0)
-        stq     t2, 0(t8)
-
-        stq     t4, 0(t9)
-        subq    a3, 4, a3
-        stq     t6, 0(ta)
-        addq    ta, a2, a0
-
-        bne     a3, $unaligned
-        ret
-
-        .align 4
-$aligned:
-        ldq     t0, 0(a1)
-        addq    a1, a2, a1
-        ldq     t1, 0(a1)
-        addq    a1, a2, a1
-
-        ldq     t2, 0(a1)
-        addq    a1, a2, a1
-        ldq     t3, 0(a1)
-
-        addq    a0, a2, t4
-        addq    a1, a2, a1
-        addq    t4, a2, t5
-        subq    a3, 4, a3
-
-        stq     t0, 0(a0)
-        addq    t5, a2, t6
-        stq     t1, 0(t4)
-        addq    t6, a2, a0
-
-        stq     t2, 0(t5)
-        stq     t3, 0(t6)
-
-        bne     a3, $aligned
-        ret
-        .end put_pixels_axp_asm
-
-/************************************************************************
  * void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
  *                                 int line_size)
  */
diff --git a/libavcodec/alpha/dsputil_alpha.c b/libavcodec/alpha/hpeldsp_alpha.c
similarity index 70%
copy from libavcodec/alpha/dsputil_alpha.c
copy to libavcodec/alpha/hpeldsp_alpha.c
index 34fe2eb..1f1699f 100644
--- a/libavcodec/alpha/dsputil_alpha.c
+++ b/libavcodec/alpha/hpeldsp_alpha.c
@@ -19,106 +19,10 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "libavutil/attributes.h"
-#include "libavcodec/dsputil.h"
-#include "dsputil_alpha.h"
+#include "libavcodec/hpeldsp.h"
+#include "hpeldsp_alpha.h"
 #include "asm.h"
 
-void (*put_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
-                                 int line_size);
-void (*add_pixels_clamped_axp_p)(const int16_t *block, uint8_t *pixels,
-                                 int line_size);
-
-#if 0
-/* These functions were the base for the optimized assembler routines,
-   and remain here for documentation purposes.  */
-static void put_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
-                                   ptrdiff_t line_size)
-{
-    int i = 8;
-    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
-
-    do {
-        uint64_t shorts0, shorts1;
-
-        shorts0 = ldq(block);
-        shorts0 = maxsw4(shorts0, 0);
-        shorts0 = minsw4(shorts0, clampmask);
-        stl(pkwb(shorts0), pixels);
-
-        shorts1 = ldq(block + 4);
-        shorts1 = maxsw4(shorts1, 0);
-        shorts1 = minsw4(shorts1, clampmask);
-        stl(pkwb(shorts1), pixels + 4);
-
-        pixels += line_size;
-        block += 8;
-    } while (--i);
-}
-
-void add_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
-                            ptrdiff_t line_size)
-{
-    int h = 8;
-    /* Keep this function a leaf function by generating the constants
-       manually (mainly for the hack value ;-).  */
-    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
-    uint64_t signmask  = zap(-1, 0x33);
-    signmask ^= signmask >> 1;  /* 0x8000800080008000 */
-
-    do {
-        uint64_t shorts0, pix0, signs0;
-        uint64_t shorts1, pix1, signs1;
-
-        shorts0 = ldq(block);
-        shorts1 = ldq(block + 4);
-
-        pix0    = unpkbw(ldl(pixels));
-        /* Signed subword add (MMX paddw).  */
-        signs0  = shorts0 & signmask;
-        shorts0 &= ~signmask;
-        shorts0 += pix0;
-        shorts0 ^= signs0;
-        /* Clamp. */
-        shorts0 = maxsw4(shorts0, 0);
-        shorts0 = minsw4(shorts0, clampmask);
-
-        /* Next 4.  */
-        pix1    = unpkbw(ldl(pixels + 4));
-        signs1  = shorts1 & signmask;
-        shorts1 &= ~signmask;
-        shorts1 += pix1;
-        shorts1 ^= signs1;
-        shorts1 = maxsw4(shorts1, 0);
-        shorts1 = minsw4(shorts1, clampmask);
-
-        stl(pkwb(shorts0), pixels);
-        stl(pkwb(shorts1), pixels + 4);
-
-        pixels += line_size;
-        block += 8;
-    } while (--h);
-}
-#endif
-
-static void clear_blocks_axp(int16_t *blocks) {
-    uint64_t *p = (uint64_t *) blocks;
-    int n = sizeof(int16_t) * 6 * 64;
-
-    do {
-        p[0] = 0;
-        p[1] = 0;
-        p[2] = 0;
-        p[3] = 0;
-        p[4] = 0;
-        p[5] = 0;
-        p[6] = 0;
-        p[7] = 0;
-        p += 8;
-        n -= 8 * 8;
-    } while (n);
-}
-
 static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
 {
     return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
@@ -269,11 +173,8 @@ static void put_pixels16_axp_asm(uint8_t *block, const 
uint8_t *pixels,
     put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
 }
 
-av_cold void ff_dsputil_init_alpha(DSPContext *c, AVCodecContext *avctx)
+av_cold void ff_hpeldsp_init_alpha(HpelDSPContext* c, int flags)
 {
-    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
-
-    if (!high_bit_depth) {
     c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
     c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
     c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
@@ -308,35 +209,4 @@ av_cold void ff_dsputil_init_alpha(DSPContext *c, 
AVCodecContext *avctx)
     c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
     c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
     c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
-
-    c->clear_blocks = clear_blocks_axp;
-    }
-
-    /* amask clears all bits that correspond to present features.  */
-    if (amask(AMASK_MVI) == 0) {
-        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
-        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;
-
-        if (!high_bit_depth)
-            c->get_pixels   = get_pixels_mvi;
-        c->diff_pixels      = diff_pixels_mvi;
-        c->sad[0]           = pix_abs16x16_mvi_asm;
-        c->sad[1]           = pix_abs8x8_mvi;
-        c->pix_abs[0][0]    = pix_abs16x16_mvi_asm;
-        c->pix_abs[1][0]    = pix_abs8x8_mvi;
-        c->pix_abs[0][1]    = pix_abs16x16_x2_mvi;
-        c->pix_abs[0][2]    = pix_abs16x16_y2_mvi;
-        c->pix_abs[0][3]    = pix_abs16x16_xy2_mvi;
-    }
-
-    put_pixels_clamped_axp_p = c->put_pixels_clamped;
-    add_pixels_clamped_axp_p = c->add_pixels_clamped;
-
-    if (avctx->bits_per_raw_sample <= 8 &&
-        (avctx->idct_algo == FF_IDCT_AUTO ||
-         avctx->idct_algo == FF_IDCT_SIMPLEALPHA)) {
-        c->idct_put = ff_simple_idct_put_axp;
-        c->idct_add = ff_simple_idct_add_axp;
-        c->idct =     ff_simple_idct_axp;
-    }
 }
diff --git a/libavcodec/bfin/vp3_bfin.h b/libavcodec/alpha/hpeldsp_alpha.h
similarity index 76%
copy from libavcodec/bfin/vp3_bfin.h
copy to libavcodec/alpha/hpeldsp_alpha.h
index 5a2c5a4..304cf2c 100644
--- a/libavcodec/bfin/vp3_bfin.h
+++ b/libavcodec/alpha/hpeldsp_alpha.h
@@ -16,12 +16,12 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-
-#ifndef AVCODEC_BFIN_VP3_BFIN_H
-#define AVCODEC_BFIN_VP3_BFIN_H
+#ifndef AVCODEC_ALPHA_HPELDSP_ALPHA_H
+#define AVCODEC_ALPHA_HPELDSP_ALPHA_H
 
 #include <stdint.h>
 
-void ff_bfin_vp3_idct(int16_t *block);
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
 
-#endif /* AVCODEC_BFIN_VP3_BFIN_H */
+#endif /* AVCODEC_ALPHA_HPELDSP_ALPHA_H */
diff --git a/libavcodec/alpha/hpeldsp_alpha_asm.S 
b/libavcodec/alpha/hpeldsp_alpha_asm.S
new file mode 100644
index 0000000..122b542
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha_asm.S
@@ -0,0 +1,135 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <[email protected]>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+/* Some nicer register names.  */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/************************************************************************
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ *                         int line_size, int h)
+ */
+        .align 6
+        .globl put_pixels_axp_asm
+        .ent put_pixels_axp_asm
+put_pixels_axp_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        and     a1, 7, t0
+        beq     t0, $aligned
+
+        .align 4
+$unaligned:
+        ldq_u   t0, 0(a1)
+        ldq_u   t1, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t2, 0(a1)
+        ldq_u   t3, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t4, 0(a1)
+        ldq_u   t5, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t6, 0(a1)
+        ldq_u   t7, 8(a1)
+        extql   t0, a1, t0
+        addq    a1, a2, a1
+
+        extqh   t1, a1, t1
+        addq    a0, a2, t8
+        extql   t2, a1, t2
+        addq    t8, a2, t9
+
+        extqh   t3, a1, t3
+        addq    t9, a2, ta
+        extql   t4, a1, t4
+        or      t0, t1, t0
+
+        extqh   t5, a1, t5
+        or      t2, t3, t2
+        extql   t6, a1, t6
+        or      t4, t5, t4
+
+        extqh   t7, a1, t7
+        or      t6, t7, t6
+        stq     t0, 0(a0)
+        stq     t2, 0(t8)
+
+        stq     t4, 0(t9)
+        subq    a3, 4, a3
+        stq     t6, 0(ta)
+        addq    ta, a2, a0
+
+        bne     a3, $unaligned
+        ret
+
+        .align 4
+$aligned:
+        ldq     t0, 0(a1)
+        addq    a1, a2, a1
+        ldq     t1, 0(a1)
+        addq    a1, a2, a1
+
+        ldq     t2, 0(a1)
+        addq    a1, a2, a1
+        ldq     t3, 0(a1)
+
+        addq    a0, a2, t4
+        addq    a1, a2, a1
+        addq    t4, a2, t5
+        subq    a3, 4, a3
+
+        stq     t0, 0(a0)
+        addq    t5, a2, t6
+        stq     t1, 0(t4)
+        addq    t6, a2, a0
+
+        stq     t2, 0(t5)
+        stq     t3, 0(t6)
+
+        bne     a3, $aligned
+        ret
+        .end put_pixels_axp_asm
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index 4b7623e..76ca85d 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -54,6 +54,7 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
     hpel_funcs(avg, [3],  2);
     hpel_funcs(avg_no_rnd,, 16);
 
+    if (ARCH_ALPHA)      ff_hpeldsp_init_alpha (c, flags);
     if (ARCH_ARM)        ff_hpeldsp_init_arm   (c, flags);
     if (ARCH_BFIN)       ff_hpeldsp_init_bfin  (c, flags);
     if (ARCH_PPC)        ff_hpeldsp_init_ppc   (c, flags);
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index 6c267cf..d304ec2 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -94,6 +94,7 @@ typedef struct HpelDSPContext {
 
 void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 
+void ff_hpeldsp_init_alpha(HpelDSPContext* c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext* c, int flags);
 void ff_hpeldsp_init_bfin(HpelDSPContext* c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext* c, int flags);
-- 
1.7.9.4

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 23/29] alpha: hpeldsp: Move half-pel assembly from dsputil to hpeldsp

Reply via email to