---
libavcodec/alpha/dsputil_alpha.c | 72 ---------
libavcodec/alpha/motion_est_alpha.c | 43 -----
libavcodec/cabac.h | 83 ----------
libavcodec/celp_filters.c | 9 -
libavcodec/dwt.c | 8 -
libavcodec/sh4/idct_sh4.c | 151 ------------------
libavcodec/x86/idct_mmx.c | 112 --------------
libavcodec/x86/mpegvideo_mmx_template.c | 11 --
libavcodec/x86/simple_idct_mmx.c | 256 -------------------------------
9 files changed, 0 insertions(+), 745 deletions(-)
diff --git a/libavcodec/alpha/dsputil_alpha.c b/libavcodec/alpha/dsputil_alpha.c
index 610f92a..d20bd76 100644
--- a/libavcodec/alpha/dsputil_alpha.c
+++ b/libavcodec/alpha/dsputil_alpha.c
@@ -28,78 +28,6 @@ void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
int line_size);
-#if 0
-/* These functions were the base for the optimized assembler routines,
- and remain here for documentation purposes. */
-static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
- int line_size)
-{
- int i = 8;
- uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
-
- do {
- uint64_t shorts0, shorts1;
-
- shorts0 = ldq(block);
- shorts0 = maxsw4(shorts0, 0);
- shorts0 = minsw4(shorts0, clampmask);
- stl(pkwb(shorts0), pixels);
-
- shorts1 = ldq(block + 4);
- shorts1 = maxsw4(shorts1, 0);
- shorts1 = minsw4(shorts1, clampmask);
- stl(pkwb(shorts1), pixels + 4);
-
- pixels += line_size;
- block += 8;
- } while (--i);
-}
-
-void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
- int line_size)
-{
- int h = 8;
- /* Keep this function a leaf function by generating the constants
- manually (mainly for the hack value ;-). */
- uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
- uint64_t signmask = zap(-1, 0x33);
- signmask ^= signmask >> 1; /* 0x8000800080008000 */
-
- do {
- uint64_t shorts0, pix0, signs0;
- uint64_t shorts1, pix1, signs1;
-
- shorts0 = ldq(block);
- shorts1 = ldq(block + 4);
-
- pix0 = unpkbw(ldl(pixels));
- /* Signed subword add (MMX paddw). */
- signs0 = shorts0 & signmask;
- shorts0 &= ~signmask;
- shorts0 += pix0;
- shorts0 ^= signs0;
- /* Clamp. */
- shorts0 = maxsw4(shorts0, 0);
- shorts0 = minsw4(shorts0, clampmask);
-
- /* Next 4. */
- pix1 = unpkbw(ldl(pixels + 4));
- signs1 = shorts1 & signmask;
- shorts1 &= ~signmask;
- shorts1 += pix1;
- shorts1 ^= signs1;
- shorts1 = maxsw4(shorts1, 0);
- shorts1 = minsw4(shorts1, clampmask);
-
- stl(pkwb(shorts0), pixels);
- stl(pkwb(shorts1), pixels + 4);
-
- pixels += line_size;
- block += 8;
- } while (--h);
-}
-#endif
-
static void clear_blocks_axp(DCTELEM *blocks) {
uint64_t *p = (uint64_t *) blocks;
int n = sizeof(DCTELEM) * 6 * 64;
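
For readers, the #if 0 block removed above was the portable model of the MVI
assembler: add each DCT coefficient to a pixel and clamp the result to
[0,255], four 16-bit lanes per 64-bit word via the sign-mask trick. A minimal
scalar C sketch of the same operation (illustrative names, not libav API):

    #include <stdint.h>

    static void add_pixels_clamped_ref(const int16_t *block, uint8_t *pixels,
                                       int line_size)
    {
        for (int i = 0; i < 8; i++) {
            for (int j = 0; j < 8; j++) {
                int v = pixels[j] + block[j];               /* signed add, like paddw */
                pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;  /* clamp to [0,255] */
            }
            pixels += line_size;
            block  += 8;
        }
    }
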
diff --git a/libavcodec/alpha/motion_est_alpha.c b/libavcodec/alpha/motion_est_alpha.c
index bb9ab13..99bcea2 100644
--- a/libavcodec/alpha/motion_est_alpha.c
+++ b/libavcodec/alpha/motion_est_alpha.c
@@ -119,49 +119,6 @@ int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
return result;
}
-#if 0 /* now done in assembly */
-int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
-{
- int result = 0;
- int h = 16;
-
- if ((size_t) pix2 & 0x7) {
- /* works only when pix2 is actually unaligned */
- do { /* do 16 pixel a time */
- uint64_t p1_l, p1_r, p2_l, p2_r;
- uint64_t t;
-
- p1_l = ldq(pix1);
- p1_r = ldq(pix1 + 8);
- t = ldq_u(pix2 + 8);
- p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
- p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
- pix1 += line_size;
- pix2 += line_size;
-
- result += perr(p1_l, p2_l)
- + perr(p1_r, p2_r);
- } while (--h);
- } else {
- do {
- uint64_t p1_l, p1_r, p2_l, p2_r;
-
- p1_l = ldq(pix1);
- p1_r = ldq(pix1 + 8);
- p2_l = ldq(pix2);
- p2_r = ldq(pix2 + 8);
- pix1 += line_size;
- pix2 += line_size;
-
- result += perr(p1_l, p2_l)
- + perr(p1_r, p2_r);
- } while (--h);
- }
-
- return result;
-}
-#endif
-
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size,
int h)
{
int result = 0;
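
The removed pix_abs16x16_mvi computed a 16x16 sum of absolute differences;
perr() is the MVI instruction that sums the byte-wise absolute differences of
two 64-bit words. A scalar sketch of the same metric (illustrative names, not
the libav entry point):

    #include <stdint.h>
    #include <stdlib.h>

    static int pix_abs16x16_ref(const uint8_t *pix1, const uint8_t *pix2,
                                int line_size)
    {
        int sum = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++)
                sum += abs(pix1[x] - pix2[x]);  /* perr() does 8 bytes of this at once */
            pix1 += line_size;
            pix2 += line_size;
        }
        return sum;
    }
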
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 52c723c..9b916bf 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -302,25 +302,6 @@ static inline void renorm_cabac_decoder(CABACContext *c){
static inline void renorm_cabac_decoder_once(CABACContext *c){
#ifdef ARCH_X86_DISABLED
int temp;
-#if 0
- //P3:683 athlon:475
- __asm__(
- "lea -0x100(%0), %2 \n\t"
- "shr $31, %2 \n\t" //FIXME 31->63 for x86-64
- "shl %%cl, %0 \n\t"
- "shl %%cl, %1 \n\t"
- : "+r"(c->range), "+r"(c->low), "+c"(temp)
- );
-#elif 0
- //P3:680 athlon:474
- __asm__(
- "cmp $0x100, %0 \n\t"
- "setb %%cl \n\t" //FIXME 31->63 for x86-64
- "shl %%cl, %0 \n\t"
- "shl %%cl, %1 \n\t"
- : "+r"(c->range), "+r"(c->low), "+c"(temp)
- );
-#elif 1
int temp2;
//P3:665 athlon:517
__asm__(
@@ -333,31 +314,6 @@ static inline void renorm_cabac_decoder_once(CABACContext *c){
"add %%edx, %1 \n\t"
: "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
);
-#elif 0
- int temp2;
- //P3:673 athlon:509
- __asm__(
- "cmp $0x100, %0 \n\t"
- "sbb %%edx, %%edx \n\t"
- "mov %0, %%eax \n\t"
- "and %%edx, %0 \n\t"
- "and %1, %%edx \n\t"
- "add %%eax, %0 \n\t"
- "add %%edx, %1 \n\t"
- : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
- );
-#else
- int temp2;
- //P3:677 athlon:511
- __asm__(
- "cmp $0x100, %0 \n\t"
- "lea (%0, %0), %%eax \n\t"
- "lea (%1, %1), %%edx \n\t"
- "cmovb %%eax, %0 \n\t"
- "cmovb %%edx, %1 \n\t"
- : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2)
- );
-#endif
#else
//P3:675 athlon:476
int shift= (uint32_t)(c->range - 0x100)>>31;
@@ -399,19 +355,11 @@ static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const st
"cmpl %%ecx, %%ebx \n\t"
" ja 1f \n\t"
-#if 1
//athlon:4067 P3:4110
"lea -0x100(%%edx), %%ecx \n\t"
"shr $31, %%ecx \n\t"
"shl %%cl, %%edx \n\t"
"shl %%cl, %%ebx \n\t"
-#else
- //athlon:4057 P3:4130
- "cmp $0x100, %%edx \n\t" //FIXME avoidable
- "setb %%cl \n\t"
- "shl %%cl, %%edx \n\t"
- "shl %%cl, %%ebx \n\t"
-#endif
"movzbl "MANGLE(ff_h264_mps_state)"(%0), %%ecx \n\t"
"movb %%cl, (%1) \n\t"
//eax:state ebx:low, edx:range, esi:RangeLPS
@@ -589,36 +537,6 @@ static int av_unused get_cabac(CABACContext *c, uint8_t * const state){
}
static int av_unused get_cabac_bypass(CABACContext *c){
-#if 0 //not faster
- int bit;
- __asm__ volatile(
- "movl "RANGE "(%1), %%ebx \n\t"
- "movl "LOW "(%1), %%eax \n\t"
- "shl $17, %%ebx \n\t"
- "add %%eax, %%eax \n\t"
- "sub %%ebx, %%eax \n\t"
- "cltd \n\t"
- "and %%edx, %%ebx \n\t"
- "add %%ebx, %%eax \n\t"
- "test %%ax, %%ax \n\t"
- " jnz 1f \n\t"
- "movl "BYTE "(%1), %%"REG_b" \n\t"
- "subl $0xFFFF, %%eax \n\t"
- "movzwl (%%"REG_b"), %%ecx \n\t"
- "bswap %%ecx \n\t"
- "shrl $15, %%ecx \n\t"
- "addl $2, %%"REG_b" \n\t"
- "addl %%ecx, %%eax \n\t"
- "movl %%"REG_b", "BYTE "(%1) \n\t"
- "1: \n\t"
- "movl %%eax, "LOW "(%1) \n\t"
-
- :"=&d"(bit)
- :"r"(c)
- : "%eax", "%"REG_b, "%ecx", "memory"
- );
- return bit+1;
-#else
int range;
c->low += c->low;
@@ -632,7 +550,6 @@ static int av_unused get_cabac_bypass(CABACContext *c){
c->low -= range;
return 1;
}
-#endif
}
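
The variants deleted from renorm_cabac_decoder_once were timing experiments;
the branchless form kept in the context above is the survivor. Its idea in
plain C: when range has dropped below 0x100 the subtraction borrows, the sign
bit of the 32-bit result is set, and both range and low get doubled (a
sketch, not the CABACContext API):

    #include <stdint.h>

    static inline void renorm_once_ref(uint32_t *range, uint32_t *low)
    {
        int shift = (uint32_t)(*range - 0x100) >> 31;  /* 1 iff *range < 0x100 */
        *range <<= shift;
        *low   <<= shift;
    }
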
diff --git a/libavcodec/celp_filters.c b/libavcodec/celp_filters.c
index 32eaff3..06d3a75 100644
--- a/libavcodec/celp_filters.c
+++ b/libavcodec/celp_filters.c
@@ -85,14 +85,6 @@ void ff_celp_lp_synthesis_filterf(float *out, const float *filter_coeffs,
int filter_length)
{
int i,n;
-
-#if 0 // Unoptimized code path for improved readability
- for (n = 0; n < buffer_length; n++) {
- out[n] = in[n];
- for (i = 1; i <= filter_length; i++)
- out[n] -= filter_coeffs[i-1] * out[n-i];
- }
-#else
float out0, out1, out2, out3;
float old_out0, old_out1, old_out2, old_out3;
float a,b,c;
@@ -193,7 +185,6 @@ void ff_celp_lp_synthesis_filterf(float *out, const float *filter_coeffs,
for (i = 1; i <= filter_length; i++)
out[n] -= filter_coeffs[i-1] * out[n-i];
}
-#endif
}
void ff_celp_lp_zero_synthesis_filterf(float *out, const float *filter_coeffs,
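
The removed readability path still documents the filter's semantics: a
direct-form all-pole (LP synthesis) recursion in which out[-1..-filter_length]
must hold the previous output samples. Restated as a standalone sketch:

    static void lp_synthesis_ref(float *out, const float *in,
                                 const float *filter_coeffs,
                                 int buffer_length, int filter_length)
    {
        for (int n = 0; n < buffer_length; n++) {
            out[n] = in[n];
            for (int i = 1; i <= filter_length; i++)
                out[n] -= filter_coeffs[i - 1] * out[n - i];  /* feedback taps */
        }
    }
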
diff --git a/libavcodec/dwt.c b/libavcodec/dwt.c
index 2c5b56c..497c4fb 100644
--- a/libavcodec/dwt.c
+++ b/libavcodec/dwt.c
@@ -511,13 +511,6 @@ void ff_snow_horizontal_compose97i(IDWTELEM *b, int width){
IDWTELEM temp[width];
const int w2= (width+1)>>1;
-#if 0 //maybe more understadable but slower
- inv_lift (temp , b , b +w2, 2, 1, 1, width, W_DM, W_DO, W_DS, 0, 1);
- inv_lift (temp+1 , b +w2, temp , 2, 1, 2, width, W_CM, W_CO, W_CS, 1, 1);
-
- inv_liftS(b , temp , temp+1 , 2, 2, 2, width, W_BM, W_BO, W_BS, 0, 1);
- inv_lift (b+1 , temp+1 , b , 2, 2, 2, width, W_AM, W_AO, W_AS, 1, 0);
-#else
int x;
temp[0] = b[0] - ((3*b[w2]+2)>>2);
for(x=1; x<(width>>1); x++){
@@ -540,7 +533,6 @@ void ff_snow_horizontal_compose97i(IDWTELEM *b, int width){
b[x-1] = temp[x-1] + ((3*(b [x-2] + b [x ] ))>>1);
}else
b[x-1] = temp[x-1] + 3*b [x-2];
-#endif
}
static void vertical_compose97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
int width){
diff --git a/libavcodec/sh4/idct_sh4.c b/libavcodec/sh4/idct_sh4.c
index 0758cd9..0baff39 100644
--- a/libavcodec/sh4/idct_sh4.c
+++ b/libavcodec/sh4/idct_sh4.c
@@ -54,8 +54,6 @@ static const float odd_table[] __attribute__ ((aligned(8))) = {
#undef c6
#undef c7
-#if 1
-
#define load_matrix(table) \
do { \
const float *t = table; \
@@ -84,52 +82,11 @@ static const float odd_table[] __attribute__ ((aligned(8))) = {
register float fr2 __asm__("fr2"); \
register float fr3 __asm__("fr3")
-#else
-
-/* generic C code for check */
-
-static void ftrv_(const float xf[],float fv[])
-{
- float f0,f1,f2,f3;
- f0 = fv[0];
- f1 = fv[1];
- f2 = fv[2];
- f3 = fv[3];
- fv[0] = xf[0]*f0 + xf[4]*f1 + xf[ 8]*f2 + xf[12]*f3;
- fv[1] = xf[1]*f0 + xf[5]*f1 + xf[ 9]*f2 + xf[13]*f3;
- fv[2] = xf[2]*f0 + xf[6]*f1 + xf[10]*f2 + xf[14]*f3;
- fv[3] = xf[3]*f0 + xf[7]*f1 + xf[11]*f2 + xf[15]*f3;
-}
-
-static void load_matrix_(float xf[],const float table[])
-{
- int i;
- for(i=0;i<16;i++) xf[i]=table[i];
-}
-
-#define ftrv() ftrv_(xf,fv)
-#define load_matrix(table) load_matrix_(xf,table)
-
-#define DEFREG \
- float fv[4],xf[16]
-
-#define fr0 fv[0]
-#define fr1 fv[1]
-#define fr2 fv[2]
-#define fr3 fv[3]
-
-#endif
-
-#if 1
#define DESCALE(x,n) (x)*(1.0f/(1<<(n)))
-#else
-#define DESCALE(x,n) (((int)(x)+(1<<(n-1)))>>(n))
-#endif
/* this code work worse on gcc cvs. 3.2.3 work fine */
-#if 1
//optimized
void idct_sh4(DCTELEM *block)
@@ -252,111 +209,3 @@ void idct_sh4(DCTELEM *block)
fp_single_leave(fpscr);
}
-#else
-void idct_sh4(DCTELEM *block)
-{
- DEFREG;
-
- int i;
- float tblock[8*8],*fblock;
-
- /* row */
-
- /* even part */
- load_matrix(even_table);
-
- fblock = tblock;
- i = 8;
- do {
- fr0 = block[0];
- fr1 = block[2];
- fr2 = block[4];
- fr3 = block[6];
- block+=8;
- ftrv();
- fblock[0] = fr0;
- fblock[2] = fr1;
- fblock[4] = fr2;
- fblock[6] = fr3;
- fblock+=8;
- } while(--i);
- block-=8*8;
- fblock-=8*8;
-
- load_matrix(odd_table);
-
- i = 8;
-
- do {
- float t0,t1,t2,t3;
- fr0 = block[1];
- fr1 = block[3];
- fr2 = block[5];
- fr3 = block[7];
- block+=8;
- ftrv();
- t0 = fblock[0];
- t1 = fblock[2];
- t2 = fblock[4];
- t3 = fblock[6];
- fblock[0] = t0 + fr0;
- fblock[7] = t0 - fr0;
- fblock[1] = t1 + fr1;
- fblock[6] = t1 - fr1;
- fblock[2] = t2 + fr2;
- fblock[5] = t2 - fr2;
- fblock[3] = t3 + fr3;
- fblock[4] = t3 - fr3;
- fblock+=8;
- } while(--i);
- block-=8*8;
- fblock-=8*8;
-
- /* col */
-
- /* even part */
- load_matrix(even_table);
-
- i = 8;
-
- do {
- fr0 = fblock[8*0];
- fr1 = fblock[8*2];
- fr2 = fblock[8*4];
- fr3 = fblock[8*6];
- ftrv();
- fblock[8*0] = fr0;
- fblock[8*2] = fr1;
- fblock[8*4] = fr2;
- fblock[8*6] = fr3;
- fblock++;
- } while(--i);
- fblock-=8;
-
- load_matrix(odd_table);
-
- i=8;
- do {
- float t0,t1,t2,t3;
- fr0 = fblock[8*1];
- fr1 = fblock[8*3];
- fr2 = fblock[8*5];
- fr3 = fblock[8*7];
- ftrv();
- t0 = fblock[8*0];
- t1 = fblock[8*2];
- t2 = fblock[8*4];
- t3 = fblock[8*6];
- fblock++;
- block[8*0] = DESCALE(t0 + fr0,3);
- block[8*7] = DESCALE(t0 - fr0,3);
- block[8*1] = DESCALE(t1 + fr1,3);
- block[8*6] = DESCALE(t1 - fr1,3);
- block[8*2] = DESCALE(t2 + fr2,3);
- block[8*5] = DESCALE(t2 - fr2,3);
- block[8*3] = DESCALE(t3 + fr3,3);
- block[8*4] = DESCALE(t3 - fr3,3);
- block++;
- } while(--i);
-}
-#endif
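
The deleted generic-C check path is still the clearest statement of what the
SH-4 code does: ftrv_() models the FTRV instruction, a column-major 4x4
matrix times 4-vector product applied in place. Kept here as a standalone
sketch:

    static void ftrv_ref(const float xf[16], float fv[4])
    {
        const float f0 = fv[0], f1 = fv[1], f2 = fv[2], f3 = fv[3];
        fv[0] = xf[0]*f0 + xf[4]*f1 + xf[ 8]*f2 + xf[12]*f3;
        fv[1] = xf[1]*f0 + xf[5]*f1 + xf[ 9]*f2 + xf[13]*f3;
        fv[2] = xf[2]*f0 + xf[6]*f1 + xf[10]*f2 + xf[14]*f3;
        fv[3] = xf[3]*f0 + xf[7]*f1 + xf[11]*f2 + xf[15]*f3;
    }
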
diff --git a/libavcodec/x86/idct_mmx.c b/libavcodec/x86/idct_mmx.c
index 57fa818..fb31598 100644
--- a/libavcodec/x86/idct_mmx.c
+++ b/libavcodec/x86/idct_mmx.c
@@ -33,46 +33,6 @@
#define rounder(bias) {round (bias), round (bias)}
-#if 0
-/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
-static inline void idct_row (int16_t * row, int offset,
- int16_t * table, int32_t * rounder)
-{
- int C1, C2, C3, C4, C5, C6, C7;
- int a0, a1, a2, a3, b0, b1, b2, b3;
-
- row += offset;
-
- C1 = table[1];
- C2 = table[2];
- C3 = table[3];
- C4 = table[4];
- C5 = table[5];
- C6 = table[6];
- C7 = table[7];
-
- a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
- a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
- a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
- a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
-
- b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
- b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
- b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
- b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
-
- row[0] = (a0 + b0) >> ROW_SHIFT;
- row[1] = (a1 + b1) >> ROW_SHIFT;
- row[2] = (a2 + b2) >> ROW_SHIFT;
- row[3] = (a3 + b3) >> ROW_SHIFT;
- row[4] = (a3 - b3) >> ROW_SHIFT;
- row[5] = (a2 - b2) >> ROW_SHIFT;
- row[6] = (a1 - b1) >> ROW_SHIFT;
- row[7] = (a0 - b0) >> ROW_SHIFT;
-}
-#endif
-
-
/* MMXEXT row IDCT */
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \
@@ -342,78 +302,6 @@ static inline void mmx_row_mid (int16_t * const row, const int store,
}
-#if 0
-/* C column IDCT - it is just here to document the MMXEXT and MMX versions */
-static inline void idct_col (int16_t * col, int offset)
-{
-/* multiplication - as implemented on mmx */
-#define F(c,x) (((c) * (x)) >> 16)
-
-/* saturation - it helps us handle torture test cases */
-#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))
-
- int16_t x0, x1, x2, x3, x4, x5, x6, x7;
- int16_t y0, y1, y2, y3, y4, y5, y6, y7;
- int16_t a0, a1, a2, a3, b0, b1, b2, b3;
- int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
-
- col += offset;
-
- x0 = col[0*8];
- x1 = col[1*8];
- x2 = col[2*8];
- x3 = col[3*8];
- x4 = col[4*8];
- x5 = col[5*8];
- x6 = col[6*8];
- x7 = col[7*8];
-
- u04 = S (x0 + x4);
- v04 = S (x0 - x4);
- u26 = S (F (T2, x6) + x2);
- v26 = S (F (T2, x2) - x6);
-
- a0 = S (u04 + u26);
- a1 = S (v04 + v26);
- a2 = S (v04 - v26);
- a3 = S (u04 - u26);
-
- u17 = S (F (T1, x7) + x1);
- v17 = S (F (T1, x1) - x7);
- u35 = S (F (T3, x5) + x3);
- v35 = S (F (T3, x3) - x5);
-
- b0 = S (u17 + u35);
- b3 = S (v17 - v35);
- u12 = S (u17 - u35);
- v12 = S (v17 + v35);
- u12 = S (2 * F (C4, u12));
- v12 = S (2 * F (C4, v12));
- b1 = S (u12 + v12);
- b2 = S (u12 - v12);
-
- y0 = S (a0 + b0) >> COL_SHIFT;
- y1 = S (a1 + b1) >> COL_SHIFT;
- y2 = S (a2 + b2) >> COL_SHIFT;
- y3 = S (a3 + b3) >> COL_SHIFT;
-
- y4 = S (a3 - b3) >> COL_SHIFT;
- y5 = S (a2 - b2) >> COL_SHIFT;
- y6 = S (a1 - b1) >> COL_SHIFT;
- y7 = S (a0 - b0) >> COL_SHIFT;
-
- col[0*8] = y0;
- col[1*8] = y1;
- col[2*8] = y2;
- col[3*8] = y3;
- col[4*8] = y4;
- col[5*8] = y5;
- col[6*8] = y6;
- col[7*8] = y7;
-}
-#endif
-
-
/* MMX column IDCT */
static inline void idct_col (int16_t * const col, const int offset)
{
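
The two removed C functions documented the arithmetic model behind the MMX
versions, and their helper macros are worth restating: F() is a 16-bit
fixed-point multiply (high half of the product, as pmulhw computes) and S()
saturates to the int16_t range, matching the saturating MMX instructions. As
plain C (a sketch):

    static inline int mul16(int c, int x) { return (c * x) >> 16; }  /* F(c,x) */
    static inline int sat16(int x)                                   /* S(x)   */
    {
        return x > 32767 ? 32767 : x < -32768 ? -32768 : x;
    }
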
diff --git a/libavcodec/x86/mpegvideo_mmx_template.c b/libavcodec/x86/mpegvideo_mmx_template.c
index ddda07a..0f01cb2 100644
--- a/libavcodec/x86/mpegvideo_mmx_template.c
+++ b/libavcodec/x86/mpegvideo_mmx_template.c
@@ -116,22 +116,11 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
q = s->c_dc_scale;
/* note: block[0] is assumed to be positive */
if (!s->h263_aic) {
-#if 1
__asm__ volatile (
"mul %%ecx \n\t"
: "=d" (level), "=a"(dummy)
: "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1])
);
-#else
- __asm__ volatile (
- "xorl %%edx, %%edx \n\t"
- "divw %%cx \n\t"
- "movzwl %%ax, %%eax \n\t"
- : "=a" (level)
- : "a" ((block[0]>>2) + q), "c" (q<<1)
- : "%edx"
- );
-#endif
} else
/* For AIC we skip quant/dequant of INTRADC */
level = (block[0] + 4)>>3;
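
The surviving path replaces the deleted divw with a multiply by a precomputed
reciprocal: ff_inverse[d] is scaled so that the high 32 bits of the 32x32
product equal the quotient, which is what "mul %%ecx" leaves in %edx. A
sketch of the trick (the exact rounding convention of ff_inverse is assumed
from the FFmpeg table):

    #include <stdint.h>

    static inline uint32_t div_by_mul(uint32_t x, uint32_t d,
                                      const uint32_t *inverse_tab)
    {
        /* inverse_tab[d] ~= ceil(2^32 / d); high word of the product is x / d */
        return (uint32_t)(((uint64_t)x * inverse_tab[d]) >> 32);
    }
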
diff --git a/libavcodec/x86/simple_idct_mmx.c b/libavcodec/x86/simple_idct_mmx.c
index 8a90785..5d36496 100644
--- a/libavcodec/x86/simple_idct_mmx.c
+++ b/libavcodec/x86/simple_idct_mmx.c
@@ -86,259 +86,6 @@ static inline void idct(int16_t *block)
int16_t * const temp= (int16_t*)align_tmp;
__asm__ volatile(
-#if 0 //Alternative, simpler variant
-
-#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0
*/\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1
*/\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2
*/\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3
*/\
- "movq %%mm4, 16+" #dst " \n\t"\
-
-#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
- "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
- "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm0 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "movd %%mm7, " #dst " \n\t"\
- "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
- "movd %%mm0, 16+" #dst " \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "movd %%mm2, 96+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "movd %%mm4, 112+" #dst " \n\t"\
- "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm5 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "movd %%mm2, 32+" #dst " \n\t"\
- "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
- "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
- "movd %%mm6, 48+" #dst " \n\t"\
- "movd %%mm4, 64+" #dst " \n\t"\
- "movd %%mm5, 80+" #dst " \n\t"\
-
-
-#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
- "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
- "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
- "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
- "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
- "movq "MANGLE(wm1010)", %%mm4 \n\t"\
- "pand %%mm0, %%mm4 \n\t"\
- "por %%mm1, %%mm4 \n\t"\
- "por %%mm2, %%mm4 \n\t"\
- "por %%mm3, %%mm4 \n\t"\
- "packssdw %%mm4,%%mm4 \n\t"\
- "movd %%mm4, %%eax \n\t"\
- "orl %%eax, %%eax \n\t"\
- "jz 1f \n\t"\
- "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
- "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
- "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
- "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
- "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
- "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
- "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
- "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
- "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
- #rounder ", %%mm4 \n\t"\
- "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
- "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
- "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
- "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
- #rounder ", %%mm0 \n\t"\
- "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
- "paddd %%mm0, %%mm0 \n\t" \
- "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
- "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
- "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
- "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
- "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
- "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
- "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
- "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
- "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
- "psrad $" #shift ", %%mm7 \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
- "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
- "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
- "psrad $" #shift ", %%mm1 \n\t"\
- "psrad $" #shift ", %%mm2 \n\t"\
- "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0
*/\
- "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1
*/\
- "movq %%mm7, " #dst " \n\t"\
- "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
- "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
- "movq %%mm2, 24+" #dst " \n\t"\
- "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
- "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
- "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
- "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
- "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
- "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
- "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
- "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
- "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
- "psrad $" #shift ", %%mm2 \n\t"\
- "psrad $" #shift ", %%mm0 \n\t"\
- "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
- "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
- "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
- "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
- "psrad $" #shift ", %%mm6 \n\t"\
- "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2
*/\
- "movq %%mm2, 8+" #dst " \n\t"\
- "psrad $" #shift ", %%mm4 \n\t"\
- "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3
*/\
- "movq %%mm4, 16+" #dst " \n\t"\
- "jmp 2f \n\t"\
- "1: \n\t"\
- "pslld $16, %%mm0 \n\t"\
- "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
- "psrad $13, %%mm0 \n\t"\
- "packssdw %%mm0, %%mm0 \n\t"\
- "movq %%mm0, " #dst " \n\t"\
- "movq %%mm0, 8+" #dst " \n\t"\
- "movq %%mm0, 16+" #dst " \n\t"\
- "movq %%mm0, 24+" #dst " \n\t"\
- "2: \n\t"
-
-
-//IDCT( src0, src4, src1, src5, dst, rounder, shift)
-ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
-/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
-ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
-ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
-
-DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
-DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
-DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
-
-
-//IDCT( src0, src4, src1, src5, dst, shift)
-COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
-COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
-COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
-COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-
-#else
-
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
@@ -1117,9 +864,6 @@ IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
-
-#endif
-
/*
Input
00 40 04 44 20 60 24 64
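
The deleted alternative macros (like the kept ones) are built on pmaddwd,
which multiplies four signed 16-bit pairs and adds each pair into two 32-bit
sums - this is how a single instruction produces C4R4+C4R0 and C4r4+C4r0 at
once. A scalar model:

    #include <stdint.h>

    static inline void pmaddwd_ref(const int16_t a[4], const int16_t b[4],
                                   int32_t out[2])
    {
        out[0] = (int32_t)a[0]*b[0] + (int32_t)a[1]*b[1];
        out[1] = (int32_t)a[2]*b[2] + (int32_t)a[3]*b[3];
    }
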
--
1.7.1