On 22/03/15 12:49 PM, Vittorio Giovara wrote:
> +// AAN IDCT
If this isn't already in the tree somewhere, and it's generic enough to be
reused, then it should be shared like faanidct and added to idctdsp.
If it's HQ/HQA specific, it could still be split into a new hqdsp context
for potential optimizations.
> +
> +#define FIX_1_082 17734
> +#define FIX_1_847 30274
> +#define FIX_1_414 23170
> +#define FIX_2_613 21407 // divided by two to fit the range
> +
> +#define IDCTMUL(a, b) ((a) * (b) >> 16)
> +
> +static inline void idct_row(int16_t *blk)
> +{
> + int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmpA;
> + int tmpB, tmpC, tmpD, tmpE, tmpF, tmp10, tmp11, tmp12, tmp13, tmp14;
> +
> + tmp0 = blk[5] - blk[3];
> + tmp1 = blk[5] + blk[3];
> + tmp2 = blk[1] - blk[7];
> + tmp3 = blk[1] + blk[7];
> + tmp4 = tmp3 - tmp1;
> + tmp5 = IDCTMUL(tmp0 + tmp2, FIX_1_847);
> + tmp6 = IDCTMUL(tmp2, FIX_1_082) - tmp5;
> + tmp7 = tmp5 - IDCTMUL(tmp0, FIX_2_613) * 2;
> + tmp8 = tmp3 + tmp1;
> + tmp9 = tmp7 * 4 - tmp8;
> + tmpA = IDCTMUL(tmp4, FIX_1_414) * 4 - tmp9;
> + tmpB = tmp6 * 4 + tmpA;
> + tmpC = blk[2] + blk[6];
> + tmpD = blk[2] - blk[6];
> + tmpE = blk[0] - blk[4];
> + tmpF = blk[0] + blk[4];
> +
> + tmp10 = IDCTMUL(tmpD, FIX_1_414) * 4 - tmpC;
> + tmp11 = tmpE - tmp10;
> + tmp12 = tmpF - tmpC;
> + tmp13 = tmpE + tmp10;
> + tmp14 = tmpF + tmpC;
> +
> + blk[0] = tmp14 + tmp8;
> + blk[1] = tmp13 + tmp9;
> + blk[2] = tmp11 + tmpA;
> + blk[3] = tmp12 - tmpB;
> + blk[4] = tmp12 + tmpB;
> + blk[5] = tmp11 - tmpA;
> + blk[6] = tmp13 - tmp9;
> + blk[7] = tmp14 - tmp8;
> +}
> +
> +static inline void idct_col(int16_t *blk)
> +{
> + int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmpA;
> + int tmpB, tmpC, tmpD, tmpE, tmpF, tmp10, tmp11, tmp12, tmp13, tmp14;
> +
> + tmp0 = blk[5 * 8] - blk[3 * 8];
> + tmp1 = blk[5 * 8] + blk[3 * 8];
> + tmp2 = blk[1 * 8] * 2 - (blk[7 * 8] >> 2);
> + tmp3 = blk[1 * 8] * 2 + (blk[7 * 8] >> 2);
> + tmp4 = tmp3 - tmp1;
> + tmp5 = IDCTMUL(tmp0 + tmp2, FIX_1_847);
> + tmp6 = IDCTMUL(tmp2, FIX_1_082) - tmp5;
> + tmp7 = tmp5 - IDCTMUL(tmp0, FIX_2_613) * 2;
> + tmp8 = (tmp3 + tmp1) >> 1;
> + tmp9 = tmp7 * 2 - tmp8;
> + tmpA = IDCTMUL(tmp4, FIX_1_414) * 2 - tmp9;
> + tmpB = tmp6 * 2 + tmpA;
> + tmpC = blk[2 * 8] + (blk[6 * 8] >> 1) >> 1;
> + tmpD = blk[2 * 8] - (blk[6 * 8] >> 1);
> + tmpE = (blk[0 * 8] >> 1) - (blk[4 * 8] >> 1) + 0x2020;
> + tmpF = (blk[0 * 8] >> 1) + (blk[4 * 8] >> 1) + 0x2020;
> +
> + tmp10 = IDCTMUL(tmpD, FIX_1_414) * 2 - tmpC;
> + tmp11 = tmpE - tmp10;
> + tmp12 = tmpF - tmpC;
> + tmp13 = tmpE + tmp10;
> + tmp14 = tmpF + tmpC;
> +
> + blk[0 * 8] = (tmp14 + tmp8) >> 6;
> + blk[1 * 8] = (tmp13 + tmp9) >> 6;
> + blk[2 * 8] = (tmp11 + tmpA) >> 6;
> + blk[3 * 8] = (tmp12 - tmpB) >> 6;
> + blk[4 * 8] = (tmp12 + tmpB) >> 6;
> + blk[5 * 8] = (tmp11 - tmpA) >> 6;
> + blk[6 * 8] = (tmp13 - tmp9) >> 6;
> + blk[7 * 8] = (tmp14 - tmp8) >> 6;
> +}
> +
> +static void hq_idct_put(uint8_t *dst, int stride, int16_t *block)
> +{
> + int i, j;
> +
> + for (i = 0; i < 8; i++)
> + idct_row(block + i * 8);
> + for (i = 0; i < 8; i++)
> + idct_col(block + i);
> +
> + // or use IDCTDSPContext.put_pixels_clamped()
Benchmark put_pixels_clamped() against this loop and see whether it's worth
using? There's an optimized version for most platforms, after all.
> + for (i = 0; i < 8; i++) {
> + for (j = 0; j < 8; j++)
> + dst[j] = av_clip_uint8(block[j + i * 8]);
> + dst += stride;
> + }
> +}
> +
> +static inline void put_blocks(HQContext *c, AVFrame *pic,
> + int plane, int x, int y, int ilace,
> + int16_t *block0, int16_t *block1)
> +{
> + uint8_t *p = pic->data[plane] + x;
> +
> + hq_idct_put(p + y * pic->linesize[plane],
> + pic->linesize[plane] << ilace, block0);
> + hq_idct_put(p + (y + (ilace ? 1 : 8)) * pic->linesize[plane],
> + pic->linesize[plane] << ilace, block1);
> +}
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel