On Sun, Dec 14, 2014 at 12:26:15PM +0530, arwa arif wrote: > I have tried to port fspp. Not sure, if it is correct or not.
[...] > +static void filter(FSPPContext *p , uint8_t *dst , uint8_t *src, > + int dst_stride , int src_stride , > + int width , int height , > + uint8_t *qp_store , int qp_stride , int is_luma) { > + > + int x, x0, y, es, qy, t; > + const int stride = is_luma ? p->temp_stride : (width+16); > //((width+16+15)&(~15)) > + const int step = 6 - p->log2_count; > + const int qps = 3 + is_luma; > + DECLARE_ALIGNED(32 , int32_t , block_align)[4 * 8 * BLOCKSZ + 4 * 8 * > BLOCKSZ]; > + int16_t *block = (int16_t *)block_align; > + int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ); > + > + memset(block3 , 0 , 4 * 8 * BLOCKSZ); > + > + //p->src=src-src_stride*8-8;//! > + if (!src || !dst) return; // HACK avoid crash for Y8 colourspace > + for (y = 0 ; y < height ; y++) { > + int index = 8 + 8*stride + y*stride; > + memcpy(p->src + index , src + y*src_stride , width);//this line can > be avoided by using DR & user fr.buffers > + for (x = 0 ; x < 8 ; x++) { > + p->src[index - x - 1]= p->src[index + x ]; > + p->src[index + width + x ]= p->src[index + width - x - 1]; > + } > + } > + for (y = 0 ; y < 8 ; y++) { > + memcpy(p->src + ( 7 - y ) * stride , p->src + ( y + 8 ) > * stride , stride); > + memcpy(p->src + (height + 8 + y) * stride , p->src + (height -y + 7) > * stride , stride); > + } > + //FIXME (try edge emu) > + > + for (y = 8 ; y < 24 ; y++) > + memset(p->temp + 8 + y * stride , 0 , width * sizeof(int16_t)); > + > + for (y = step ; y < height + 8 ; y += step) { //step= 1,2 > + qy = y - 4; > + if (qy > height - 1) qy = height - 1; > + if (qy < 0) qy = 0; > + qy = (qy >> qps) * qp_stride; > + row_fdct_s(block , p->src + y * stride + 2 - (y&1) , stride , 2); > + for (x0 = 0 ; x0 < width + 8 - 8 * (BLOCKSZ - 1) ; x0 += 8 * > (BLOCKSZ - 1)) { > + row_fdct_s(block + 8 * 8 , p->src + y * stride + 8 + x0 + 2 - > (y&1) , stride , 2 * (BLOCKSZ - 1)); > + if(p->qp) > + column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block + 0 * > 8 , block3 + 0 * 8 , 8 * (BLOCKSZ - 1)); 
//yes, this is a HOTSPOT > + else > + for (x = 0 ; x < 8 * (BLOCKSZ - 1) ; x += 8) { > + t = x + x0 -2; //correct t=x+x0-2-(y&1), but its the same > + if (t<0) t = 0;//t always < width-2 > + t = qp_store[qy+(t >> qps)]; > + t = norm_qscale(t, p->qscale_type); > + if (t != p->prev_q) p->prev_q = t, mul_thrmat_s(p, t); > + column_fidct_s((int16_t*)(&p->threshold_mtx[0]) , block > + x * 8 , block3 + x * 8 , 8); //yes, this is a HOTSPOT > + } > + row_idct_s(block3 + 0*8 , p->temp + (y&15) * stride + x0 + 2 - > (y&1) , stride , 2 * (BLOCKSZ - 1)); > + memmove(block, block + (BLOCKSZ - 1) * 64 , 8 * 8 * > sizeof(int16_t)); //cycling > + memmove(block3, block3 + (BLOCKSZ - 1) * 64 , 6 * 8 * > sizeof(int16_t)); > + } > + // > + es = width + 8 - x0; // 8, ... > + if (es > 8) > + row_fdct_s(block + 8 * 8 , p->src + y * stride + 8 + x0 + 2 - > (y&1) , stride , (es - 4) >> 2); > + column_fidct_s((int16_t*)(&p->threshold_mtx[0]) , block , block3 , > es&(~1)); > + row_idct_s(block3 + 0 * 8 , p->temp + (y&15) * stride + x0 + 2 - > (y&1) , stride , es >> 2); > + const int y1 = y - 8 + step;//l5-7 l4-6 this mixes a declaration and statements; some compilers have problems with that [...] 
> +static void mul_thrmat_mmx(FSPPContext *p, int q) { > + uint64_t *adr = &p->threshold_mtx_noq[0]; > + __asm__ volatile( > + "movd %0, %%mm7 \n\t" > + "add $8*8*2, %%"REG_D" \n\t" > + "movq 0*8(%%"REG_S"), %%mm0 \n\t" > + "punpcklwd %%mm7, %%mm7 \n\t" > + "movq 1*8(%%"REG_S"), %%mm1 \n\t" > + "punpckldq %%mm7, %%mm7 \n\t" > + "pmullw %%mm7, %%mm0 \n\t" > + > + "movq 2*8(%%"REG_S"), %%mm2 \n\t" > + "pmullw %%mm7, %%mm1 \n\t" > + > + "movq 3*8(%%"REG_S"), %%mm3 \n\t" > + "pmullw %%mm7, %%mm2 \n\t" > + > + "movq %%mm0, 0*8(%%"REG_D") \n\t" > + "movq 4*8(%%"REG_S"), %%mm4 \n\t" > + "pmullw %%mm7, %%mm3 \n\t" > + > + "movq %%mm1, 1*8(%%"REG_D") \n\t" > + "movq 5*8(%%"REG_S"), %%mm5 \n\t" > + "pmullw %%mm7, %%mm4 \n\t" > + > + "movq %%mm2, 2*8(%%"REG_D") \n\t" > + "movq 6*8(%%"REG_S"), %%mm6 \n\t" > + "pmullw %%mm7, %%mm5 \n\t" > + > + "movq %%mm3, 3*8(%%"REG_D") \n\t" > + "movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t" > + "pmullw %%mm7, %%mm6 \n\t" > + > + "movq %%mm4, 4*8(%%"REG_D") \n\t" > + "movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t" > + "pmullw %%mm7, %%mm0 \n\t" > + > + "movq %%mm5, 5*8(%%"REG_D") \n\t" > + "movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t" > + "pmullw %%mm7, %%mm1 \n\t" > + > + "movq %%mm6, 6*8(%%"REG_D") \n\t" > + "movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t" > + "pmullw %%mm7, %%mm2 \n\t" > + > + "movq %%mm0, 7*8+0*8(%%"REG_D") \n\t" > + "movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t" > + "pmullw %%mm7, %%mm3 \n\t" > + > + "movq %%mm1, 7*8+1*8(%%"REG_D") \n\t" > + "movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t" > + "pmullw %%mm7, %%mm4 \n\t" > + > + "movq %%mm2, 7*8+2*8(%%"REG_D") \n\t" > + "movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t" > + "pmullw %%mm7, %%mm5 \n\t" > + > + "movq %%mm3, 7*8+3*8(%%"REG_D") \n\t" > + "movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t" > + "pmullw %%mm7, %%mm6 \n\t" > + > + "movq %%mm4, 7*8+4*8(%%"REG_D") \n\t" > + "movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t" > + "pmullw %%mm7, %%mm0 \n\t" > + > + "movq %%mm5, 7*8+5*8(%%"REG_D") \n\t" > + "pmullw %%mm7, %%mm1 \n\t" > + > + "movq %%mm6, 
7*8+6*8(%%"REG_D") \n\t" > + "movq %%mm0, 14*8+0*8(%%"REG_D") \n\t" > + "movq %%mm1, 14*8+1*8(%%"REG_D") \n\t" > + > + : "+g" (q), "+S" (adr), "+D" (adr) > + : > + ); > +} > + > +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433) = FIX64(0.382683433, > 14); > +DECLARE_ALIGNED (8, uint64_t, ff_MM_FIX_0_541196100)= FIX64(0.541196100, > 14); > +DECLARE_ALIGNED (8, uint64_t, ff_MM_FIX_0_707106781)= FIX64(0.707106781, > 14); these 2 declarations conflict with the existing fspp filter; they should be removed from one of the two files to avoid that conflict > +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965) = FIX64(1.306562965, > 14); > + > +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A) = FIX64(1.414213562, > 14); > + > +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065) = FIX64(1.847759065, > 13); > +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930) = FIX64(-2.613125930, > 13); //- > +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562) = FIX64(1.414213562, > 13); > +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200) = FIX64(1.082392200, > 13); > +//for t3,t5,t7 == 0 shortcut > +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065) = FIX64(0.847759065, > 14); > +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497) = FIX64(0.566454497, > 14); > +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367) = FIX64(0.198912367, > 14); > + > +DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND) = C64(4); > +DECLARE_ASM_CONST(8, uint64_t, MM_2) = C64(2); > + > +static void column_fidct_mmx(int16_t* thr_adr, int16_t *data, int16_t > *output, int cnt) > +{ > + DECLARE_ALIGNED(8, uint64_t, temps)[4]; > + __asm__ volatile( > + ASMALIGN(4) this fails to build; you can remove that line [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB I know you won't believe me, but the highest form of Human Excellence is to question oneself and others. -- Socrates
signature.asc
Description: Digital signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel