Re: [x265] [PATCH 6 of 6] asm: improve costCoeffRemain by bypass uncoded coeff

Deepthi Nandakumar Wed, 10 Jun 2015 00:33:08 -0700

Min, thanks for this series of patches, all pushed.

On Tue, Jun 9, 2015 at 11:36 PM, Min Chen <[email protected]> wrote:


> # HG changeset patch
> # User Min Chen <[email protected]>
> # Date 1433872879 25200
> # Node ID e5b6f0a984bdd8ab16b63fb1c11a508a444515ec
> # Parent  134670771e0c1dd0800c3e9db0a1f9f69c467e36
> asm: improve costCoeffRemain by bypass uncoded coeff
> ---
>  source/common/dct.cpp             |   17 ++++----
>  source/common/primitives.h        |    2 +-
>  source/common/x86/pixel-util.h    |    2 +-
>  source/common/x86/pixel-util8.asm |   41 +++++++++----------
>  source/encoder/entropy.cpp        |   82 ++++++++++++++++++++++++++++++++-----
>  5 files changed, 101 insertions(+), 43 deletions(-)
>
> diff -r 134670771e0c -r e5b6f0a984bd source/common/dct.cpp
> --- a/source/common/dct.cpp     Tue Jun 09 11:01:15 2015 -0700
> +++ b/source/common/dct.cpp     Tue Jun 09 11:01:19 2015 -0700
> @@ -874,19 +874,19 @@
>      return (sum & 0xFFFFFF);
>  }
>
> -uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero)
> +uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)
>  {
>      uint32_t goRiceParam = 0;
> -    int firstCoeff2 = 1;
> -    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
>
>      uint32_t sum = 0;
> -    int idx = 0;
> +    int baseLevel = 3;
>      do
>      {
> -        int baseLevel = (baseLevelN & 3) | firstCoeff2;
> -        X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
> -        baseLevelN >>= 2;
> +        if (idx >= C1FLAG_NUMBER)
> +            baseLevel = 1;
> +
> +        // TODO: the IDX is not really idx, so this check inactive
> +        //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
>          int codeNumber = absCoeff[idx] - baseLevel;
>
>          if (codeNumber >= 0)
> @@ -912,8 +912,7 @@
>                  goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
>              X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
>          }
> -        if (absCoeff[idx] >= 2)
> -            firstCoeff2 = 0;
> +        baseLevel = 2;
>          idx++;
>      }
>      while(idx < numNonZero);
> diff -r 134670771e0c -r e5b6f0a984bd source/common/primitives.h
> --- a/source/common/primitives.h        Tue Jun 09 11:01:15 2015 -0700
> +++ b/source/common/primitives.h        Tue Jun 09 11:01:19 2015 -0700
> @@ -187,7 +187,7 @@
>  typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const
> intptr_t trSize, const uint16_t scanTbl[16]);
>
>  typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t
> *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx,
> uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int
> subPosBase);
> -typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero);
> +typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
>
>  /* Function pointers to optimized encoder primitives. Each pointer can
> reference
>   * either an assembly routine, a SIMD intrinsic primitive, or a C
> function */
> diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h    Tue Jun 09 11:01:15 2015 -0700
> +++ b/source/common/x86/pixel-util.h    Tue Jun 09 11:01:19 2015 -0700
> @@ -83,7 +83,7 @@
>  uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const
> intptr_t trSize, const uint16_t scanTbl[16]);
>
>  uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t
> *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx,
> uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int
> subPosBase);
> -uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero);
> +uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero, int idx);
>
>
>  #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
> diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Tue Jun 09 11:01:15 2015 -0700
> +++ b/source/common/x86/pixel-util8.asm Tue Jun 09 11:01:19 2015 -0700
> @@ -6572,7 +6572,7 @@
>  ;}
>  ;while(idx < numNonZero);
>
> -; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero)
> +; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero, int idx)
>  INIT_XMM sse4
>  cglobal costCoeffRemain, 0,7,1
>      ; assign RCX to R3
> @@ -6580,48 +6580,43 @@
>    %if WIN64
>      DECLARE_REG_TMP 3,1,2,0
>      mov         t0, r0
> +    mov         r4d, r2d
>    %elif ARCH_X86_64
>      ; *nix x64 didn't do anything
>      DECLARE_REG_TMP 0,1,2,3
> +    mov         r4d, r2d
>    %else ; X86_32
>      DECLARE_REG_TMP 6,3,2,1
>      mov         t0, r0m
> +    mov         r4d, r2m
>    %endif
>
> -    mova        m0, [t0]
> -    packsswb    m0, [t0 + mmsize]
> -    pcmpgtb     m0, [pb_1]
> -    pmovmskb    r2d, m0
> -    bsf         r2d, r2d
> -    lea         r2d, [r2 * 2 + 1]
> -    xor         r4d, r4d
> -    bts         r4d, r2d
> -    dec         r4d
> -    and         r4d, 0x55555555
> -    or          r4d, 0x5555AAAA
> -
>      xor         t3d, t3d
>      xor         r5d, r5d
>
> +    lea         t0, [t0 + r4 * 2]
> +    mov         r2d, 3
> +
>      ; register mapping
> -    ; r4d - baseLevelN
> -    ; r2  - tmp
> +    ; r2d - baseLevel & tmp
> +    ; r4d - idx
>      ; t3  - goRiceParam
> -    ; eax - tmp - absCoeff[idx]
> +    ; eax - absCoeff[idx] & tmp
>      ; r5  - sum
>
>  .loop:
> +    mov         eax, 1
> +    cmp         r4d, 8
> +    cmovge      r2d, eax
> +
>      movzx       eax, word [t0]
>      add         t0, 2
> -    mov         r2d, r4d
> -    and         r2d, 3
> -    shr         r4d, 2
>      sub         eax, r2d                ; codeNumber = absCoeff[idx] - baseLevel
>      jl         .next
>
>      shr         eax, t3b                ; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION
>
> -    lea         r2d, [eax - 3 + 1]      ; CLZ(cidx, codeNumber + 1);
> +    lea         r2d, [rax - 3 + 1]      ; CLZ(cidx, codeNumber + 1);
>      bsr         r2d, r2d
>      add         r2d, r2d                ; codeNumber = (length + length)
>
> @@ -6644,8 +6639,10 @@
>      add         t3b, al
>
>  .next:
> -    dec   dword r1m
> -    jnz        .loop
> +    inc         r4d
> +    mov         r2d, 2
> +    cmp         r4d, r1m
> +    jl         .loop
>
>      mov         eax, r5d
>      RET
> diff -r 134670771e0c -r e5b6f0a984bd source/encoder/entropy.cpp
> --- a/source/encoder/entropy.cpp        Tue Jun 09 11:01:15 2015 -0700
> +++ b/source/encoder/entropy.cpp        Tue Jun 09 11:01:19 2015 -0700
> @@ -1431,6 +1431,55 @@
>          encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth),
> m_contextState[OFF_QT_CBF_CTX + ctx]);
>  }
>
> +#if CHECKED_BUILD || _DEBUG
> +uint32_t costCoeffRemain_c0(uint16_t *absCoeff, int numNonZero)
> +{
> +    uint32_t goRiceParam = 0;
> +    int firstCoeff2 = 1;
> +    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
> +
> +    uint32_t sum = 0;
> +    int idx = 0;
> +    do
> +    {
> +        int baseLevel = (baseLevelN & 3) | firstCoeff2;
> +        X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 +
> firstCoeff2) : 1), "baseLevel check failurr\n");
> +        baseLevelN >>= 2;
> +        int codeNumber = absCoeff[idx] - baseLevel;
> +
> +        if (codeNumber >= 0)
> +        {
> +            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel,
> goRiceParam);
> +            uint32_t length = 0;
> +
> +            codeNumber = ((uint32_t)codeNumber >> goRiceParam) -
> COEF_REMAIN_BIN_REDUCTION;
> +            if (codeNumber >= 0)
> +            {
> +                {
> +                    unsigned long cidx;
> +                    CLZ(cidx, codeNumber + 1);
> +                    length = cidx;
> +                }
> +                X265_CHECK((codeNumber != 0) || (length == 0), "length
> check failure\n");
> +
> +                codeNumber = (length + length);
> +            }
> +            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam +
> codeNumber);
> +
> +            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION <<
> goRiceParam))
> +                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
> +            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
> +        }
> +        if (absCoeff[idx] >= 2)
> +            firstCoeff2 = 0;
> +        idx++;
> +    }
> +    while(idx < numNonZero);
> +
> +    return sum;
> +}
> +#endif // debug only code
> +
>  void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff,
> uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype)
>  {
>      uint32_t trSize = 1 << log2TrSize;
> @@ -1519,7 +1568,7 @@
>      uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX]
> : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
>      uint32_t c1 = 1;
>      int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
> -    ALIGN_VAR_32(uint16_t, absCoeff[1 << MLS_CG_SIZE]);
> +    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
>      uint32_t numNonZero = 1;
>      unsigned long lastNZPosInCG;
>      unsigned long firstNZPosInCG;
> @@ -1700,6 +1749,7 @@
>              uint32_t numC1Flag = X265_MIN(numNonZero, C1FLAG_NUMBER);
>              X265_CHECK(numC1Flag > 0, "numC1Flag check failure\n");
>
> +            uint32_t firstC2Idx = 8;
>              uint32_t firstC2Flag = 2;
>              uint32_t c1Next = 0xFFFFFFFE;
>              if (!m_bitIf)
> @@ -1720,9 +1770,13 @@
>
>                      if (symbol1)
>                          c1Next = 0;
> +
>                      if (symbol1 + firstC2Flag == 3)
>                          firstC2Flag = symbol2;
>
> +                    if (symbol1 + firstC2Idx == 9)
> +                        firstC2Idx  = idx;
> +
>                      c1 = (c1Next & 3);
>                      c1Next >>= 2;
>                      X265_CHECK(c1 <= 3, "c1 check failure\n");
> @@ -1749,9 +1803,10 @@
>                  //encodeBinsEP((coeffSigns >> hiddenShift), numNonZero -
> hiddenShift);
>                  m_fracBits += (numNonZero - hiddenShift) << 15;
>
> -                if (!c1 || numNonZero > C1FLAG_NUMBER)
> +                if (numNonZero > firstC2Idx)
>                  {
> -                    uint32_t sum = primitives.costCoeffRemain(absCoeff, numNonZero);
> +                    sum = primitives.costCoeffRemain(absCoeff, numNonZero, firstC2Idx);
> +                    X265_CHECK(sum == costCoeffRemain_c0(absCoeff, numNonZero), "costCoeffRemain check failure\n");
>                      m_fracBits += ((uint64_t)sum << 15);
>                  }
>              }
> @@ -1771,6 +1826,9 @@
>                      if (symbol1 + firstC2Flag == 3)
>                          firstC2Flag = symbol2;
>
> +                    if (symbol1 + firstC2Idx == 9)
> +                        firstC2Idx  = idx;
> +
>                      c1 = (c1Next & 3);
>                      c1Next >>= 2;
>                      X265_CHECK(c1 <= 3, "c1 check failure\n");
> @@ -1793,15 +1851,17 @@
>                  {
>                      // Standard path
>                      uint32_t goRiceParam = 0;
> +                    int baseLevel = 3;
> +#if CHECKED_BUILD || _DEBUG
>                      int firstCoeff2 = 1;
> -                    uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode
> format baseLevel
> -
> -                    idx = 0;
> +#endif
> +                    idx = firstC2Idx;
>                      do
>                      {
> -                        int baseLevel = (baseLevelN & 3) | firstCoeff2;
> +                        if (idx >= C1FLAG_NUMBER)
> +                            baseLevel = 1;
> +                        // TODO: fast algorithm maybe broken this check
> logic
>                          X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ?
> (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
> -                        baseLevelN >>= 2;
>
>                          if (absCoeff[idx] >= baseLevel)
>                          {
> @@ -1810,8 +1870,10 @@
>                                  goRiceParam = (goRiceParam + 1) -
> (goRiceParam >> 2);
>                              X265_CHECK(goRiceParam <= 4, "goRiceParam
> check failure\n");
>                          }
> -                        if (absCoeff[idx] >= 2)
> -                            firstCoeff2 = 0;
> +#if CHECKED_BUILD || _DEBUG
> +                        firstCoeff2 = 0;
> +#endif
> +                        baseLevel = 2;
>                          idx++;
>                      }
>                      while(idx < numNonZero);
>
> _______________________________________________
> x265-devel mailing list
> [email protected]
> https://mailman.videolan.org/listinfo/x265-devel
>

_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Re: [x265] [PATCH 6 of 6] asm: improve costCoeffRemain by bypass uncoded coeff

Reply via email to