Praveen, can you build a Mercurial queue for these quant patches, so they can be reviewed and pushed once quant is 16-bit everywhere?
Thanks, Deepthi On Thu, Aug 14, 2014 at 2:01 AM, Steve Borho <st...@borho.org> wrote: > On 08/12, prav...@multicorewareinc.com wrote: > > # HG changeset patch > > # User Praveen Tiwari > > # Date 1407834530 -19800 > > # Node ID bb4d44663964237e4b66af6d92b2f13dbcf4f9b9 > > # Parent 8a7f4bb1d1be32fe668d410450c2e320ccae6098 > > count_nonzero primitive, downscaling quantCoeff from int32_t* to int16_t* > > There's not much point in applying these patches until all of the quant > primitives are using short ints for coefficients. As-is this will just > be a slow-down. > > > diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/dct.cpp > > --- a/source/common/dct.cpp Tue Aug 12 01:11:39 2014 -0500 > > +++ b/source/common/dct.cpp Tue Aug 12 14:38:50 2014 +0530 > > @@ -815,7 +815,7 @@ > > return numSig; > > } > > > > -int count_nonzero_c(const int32_t *quantCoeff, int numCoeff) > > +int count_nonzero_c(const int16_t *quantCoeff, int numCoeff) > > { > > X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not > aligned\n"); > > X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid > %d\n", numCoeff); > > diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/primitives.h > > --- a/source/common/primitives.h Tue Aug 12 01:11:39 2014 -0500 > > +++ b/source/common/primitives.h Tue Aug 12 14:38:50 2014 +0530 > > @@ -163,7 +163,7 @@ > > typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, > int32_t *qCoef, int qBits, int add, int numCoeff); > > typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t > *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift); > > typedef void (*dequant_normal_t)(const int32_t* quantCoef, int32_t* > coef, int num, int scale, int shift); > > -typedef int (*count_nonzero_t)(const int32_t *quantCoeff, int > numCoeff); > > +typedef int (*count_nonzero_t)(const int16_t *quantCoeff, int > numCoeff); > > > > typedef void (*weightp_pp_t)(pixel *src, pixel *dst, intptr_t > srcStride, intptr_t dstStride, int width, 
int height, int w0, int round, > int shift, int offset); > > typedef void (*weightp_sp_t)(int16_t *src, pixel *dst, intptr_t > srcStride, intptr_t dstStride, int width, int height, int w0, int round, > int shift, int offset); > > diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/quant.cpp > > --- a/source/common/quant.cpp Tue Aug 12 01:11:39 2014 -0500 > > +++ b/source/common/quant.cpp Tue Aug 12 14:38:50 2014 +0530 > > @@ -2,6 +2,7 @@ > > * Copyright (C) 2014 x265 project > > * > > * Authors: Steve Borho <st...@borho.org> > > + * Praveen Kumar Tiwari <prav...@multicorewareinc.com> > > * > > * This program is free software; you can redistribute it and/or modify > > * it under the terms of the GNU General Public License as published by > > @@ -463,7 +464,17 @@ > > const uint32_t sizeIdx = log2TrSize - 2; > > int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra; > > > > - X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << > log2TrSize * 2), "numSig differ\n"); > > + /* This section of code is to safely convert int32_t > coefficients to int16_t, once the caller function is > > + * optimize to take coefficients as int16_t*, it will be > cleanse.*/ > > + int numCoeff = (1 << (log2TrSize * 2)); > > + assert(numCoeff <= 1024); > > + ALIGN_VAR_16(int16_t, qCoeff[32 * 32]); > > + for (int i = 0; i < numCoeff; i++) > > + { > > + qCoeff[i] = (coeff[i] & 0xFFFF); > > + } > > + > > + X265_CHECK((int)numSig == primitives.count_nonzero(qCoeff, 1 << > log2TrSize * 2), "numSig differ\n"); > > > > // DC only > > if (numSig == 1 && coeff[0] != 0 && !useDST) > > @@ -501,7 +512,16 @@ > > int numCoeff = 1 << log2TrSize * 2; > > uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, > dstCoeff, qbits, add, numCoeff); > > These two loops are only here for an X265_CHECK statement that is > usually compiled out. 
All of this code should have been wrapped within > #if CHECKED_BUILD || _DEBUG > > > - X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, > numCoeff), "numSig differ\n"); > > + /* This section of code is to safely convert int32_t coefficients > to int16_t, once the caller function is > > + * optimize to take coefficients as int16_t*, it will be cleanse.*/ > > + assert(numCoeff <= 1024); > > + ALIGN_VAR_16(int16_t, qCoeff[32 * 32]); > > + for (int i = 0; i < numCoeff; i++) > > + { > > + qCoeff[i] = (dstCoeff[i] & 0xFFFF); > > + } > > + > > + X265_CHECK((int)numSig == primitives.count_nonzero(qCoeff, > numCoeff), "numSig differ\n"); > > if (!numSig) > > return 0; > > > > diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/x86/pixel-util.h > > --- a/source/common/x86/pixel-util.h Tue Aug 12 01:11:39 2014 -0500 > > +++ b/source/common/x86/pixel-util.h Tue Aug 12 14:38:50 2014 +0530 > > @@ -2,6 +2,7 @@ > > * Copyright (C) 2013 x265 project > > * > > * Authors: Steve Borho <st...@borho.org> > > + * Praveen Kumar Tiwari <prav...@multicorewareinc.com> > > * > > * This program is free software; you can redistribute it and/or modify > > * it under the terms of the GNU General Public License as published by > > @@ -47,7 +48,7 @@ > > uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t > *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff); > > uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t > *qCoef, int qBits, int add, int numCoeff); > > void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, > int num, int scale, int shift); > > -int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff); > > +int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff); > > > > void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride, > intptr_t dstStride, int width, int height, int w0, int round, int shift, > int offset); > > void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t 
srcStride, > intptr_t dstStride, int width, int height, int w0, int round, int shift, > int offset); > > diff -r 8a7f4bb1d1be -r bb4d44663964 source/common/x86/pixel-util8.asm > > --- a/source/common/x86/pixel-util8.asm Tue Aug 12 01:11:39 2014 > -0500 > > +++ b/source/common/x86/pixel-util8.asm Tue Aug 12 14:38:50 2014 > +0530 > > @@ -3,6 +3,7 @@ > > ;* > > ;* Authors: Min Chen <chenm...@163.com> <min.c...@multicorewareinc.com> > > ;* Nabajit Deka <naba...@multicorewareinc.com> > > +;* Praveen Kumar Tiwari <prav...@multicorewareinc.com> > > ;* > > ;* This program is free software; you can redistribute it and/or modify > > ;* it under the terms of the GNU General Public License as published by > > @@ -1091,10 +1092,10 @@ > > > > > > > > ;----------------------------------------------------------------------------- > > -; int count_nonzero(const int32_t *quantCoeff, int numCoeff); > > +; int count_nonzero(const int16_t *quantCoeff, int numCoeff); > > > > ;----------------------------------------------------------------------------- > > INIT_XMM ssse3 > > -cglobal count_nonzero, 2,2,5 > > +cglobal count_nonzero, 2,2,4 > > pxor m0, m0 > > shr r1d, 4 > > movd m1, r1d > > @@ -1103,12 +1104,8 @@ > > .loop: > > mova m2, [r0 + 0] > > mova m3, [r0 + 16] > > - packssdw m2, m3 > > - mova m3, [r0 + 32] > > - mova m4, [r0 + 48] > > - add r0, 64 > > - packssdw m3, m4 > > packsswb m2, m3 > > + add r0, 32 > > pcmpeqb m2, m0 > > paddb m1, m2 > > dec r1d > > diff -r 8a7f4bb1d1be -r bb4d44663964 source/encoder/entropy.cpp > > --- a/source/encoder/entropy.cpp Tue Aug 12 01:11:39 2014 -0500 > > +++ b/source/encoder/entropy.cpp Tue Aug 12 14:38:50 2014 +0530 > > @@ -2,6 +2,7 @@ > > * Copyright (C) 2013 x265 project > > * > > * Authors: Steve Borho <st...@borho.org> > > +* Praveen Kumar Tiwari <prav...@multicorewareinc.com> > > * > > * This program is free software; you can redistribute it and/or modify > > * it under the terms of the GNU General Public License as published by > > @@ 
-1488,8 +1489,18 @@ > > { > > uint32_t trSize = 1 << log2TrSize; > > > > + /* This section of code is to safely convert int32_t coefficients > to int16_t, once the caller function is > > + * optimize to take coefficients as int16_t*, it will be cleanse.*/ > > + int numCoeff = (1 << (log2TrSize << 1)); > > + assert(numCoeff <= 1024); > > + ALIGN_VAR_16(int16_t, qCoeff[32 * 32]); > > + for (int i = 0; i < numCoeff; i++) > > + { > > + qCoeff[i] = (coeff[i] & 0xFFFF); > > + } > > + > > // compute number of significant coefficients > > - uint32_t numSig = primitives.count_nonzero(coeff, (1 << (log2TrSize > << 1))); > > + uint32_t numSig = primitives.count_nonzero(qCoeff, (1 << > (log2TrSize << 1))); > > > > X265_CHECK(numSig > 0, "cbf check fail\n"); > > > > diff -r 8a7f4bb1d1be -r bb4d44663964 source/test/mbdstharness.cpp > > --- a/source/test/mbdstharness.cpp Tue Aug 12 01:11:39 2014 -0500 > > +++ b/source/test/mbdstharness.cpp Tue Aug 12 14:38:50 2014 +0530 > > @@ -366,7 +366,7 @@ > > > > bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, > count_nonzero_t opt) > > { > > - ALIGN_VAR_32(int32_t, qcoeff[32 * 32]); > > + ALIGN_VAR_32(int16_t, qcoeff[32 * 32]); > > > > for (int i = 0; i < 4; i++) > > { > > @@ -376,7 +376,7 @@ > > > > for (int n = 0; n <= num; n++) > > { > > - memset(qcoeff, 0, num * sizeof(int32_t)); > > + memset(qcoeff, 0, num * sizeof(int16_t)); > > > > for (int j = 0; j < n; j++) > > { > > @@ -386,7 +386,7 @@ > > k = (k + 11) & mask; > > } > > > > - qcoeff[k] = rand() - RAND_MAX / 2; > > + qcoeff[k] = (int16_t)rand() - RAND_MAX / 2; > > } > > > > int refval = ref(qcoeff, num); > > @@ -516,7 +516,7 @@ > > for (int i = 4; i <= 32; i <<= 1) > > { > > printf("count_nonzero[%dx%d]", i, i); > > - REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, > mbufidct, i * i) > > + REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbuf1, > i * i) > > } > > } > > } > > _______________________________________________ > > x265-devel mailing list > 
> x265-devel@videolan.org > > https://mailman.videolan.org/listinfo/x265-devel > > -- > Steve Borho > _______________________________________________ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel >
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel