[x265] [PATCH 1 of 2] [slice] slice feature in help menu
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1473280042 18000 # Node ID 4df3ce7b92dbbc4f1001742dcc3358a4edd6074c # Parent 7f9aeed70c0d4d923a566561964edc71e41f4f28 [slice] slice feature in help menu --- source/x265cli.h |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff -r 7f9aeed70c0d -r 4df3ce7b92db source/x265cli.h --- a/source/x265cli.h Tue Sep 06 11:36:39 2016 +0200 +++ b/source/x265cli.h Wed Sep 07 15:27:22 2016 -0500 @@ -2,6 +2,7 @@ * Copyright (C) 2013 x265 project * * Authors: Steve Borho <st...@borho.org> + * Min Chen <chenm...@163.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -301,6 +302,7 @@ H0(" '-' implies no threads on node, '+' implies one thread per core on node\n"); H0("-F/--frame-threads Number of concurrently encoded frames. 0: auto-determined by core count\n"); H0(" --[no-]wppEnable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront)); +H0(" --[no-]slicesEnable Multiple Slices feature. Default %s\n", OPT(param->maxSlices)); H0(" --[no-]pmode Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis)); H0(" --[no-]pmeParallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation)); H0(" --[no-]asm <bool|int|string> Override CPU detection. Default: auto\n"); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 01 of 18] [slices] new option --slices
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1472668671 18000 # Node ID cc9c2243e589d6bb41c9eb5518000826f8970551 # Parent 49a0d1176aef5bc6330fcfd39b4589616c174f0a [slices] new option --slices --- source/common/param.cpp |6 ++ source/x265.h |3 +++ source/x265cli.h|1 + 3 files changed, 10 insertions(+), 0 deletions(-) diff -r 49a0d1176aef -r cc9c2243e589 source/common/param.cpp --- a/source/common/param.cpp Wed Jul 27 21:47:20 2016 +0200 +++ b/source/common/param.cpp Wed Aug 31 13:37:51 2016 -0500 @@ -251,6 +251,7 @@ param->maxFALL = 0; param->minLuma = 0; param->maxLuma = PIXEL_MAX; +param->maxSlices = 1; } int x265_param_default_preset(x265_param* param, const char* preset, const char* tune) @@ -888,6 +889,7 @@ OPT("min-luma") p->minLuma = (uint16_t)atoi(value); OPT("max-luma") p->maxLuma = (uint16_t)atoi(value); OPT("uhd-bd") p->uhdBluray = atobool(value); +OPT("slices") p->maxSlices = atoi(value); else bExtraParams = true; if (bExtraParams) @@ -1223,6 +1225,8 @@ "qpmax exceeds supported range (0 to 69)"); CHECK(param->rc.qpMin < QP_MIN || param->rc.qpMin > QP_MAX_MAX, "qpmin exceeds supported range (0 to 69)"); +CHECK(1 > param->maxSlices || param->maxSlices > ((param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize), +"The slices can not be more than rows"); return check_failed; } @@ -1373,6 +1377,8 @@ TOOLOPT(param->bEnableFastIntra, "fast-intra"); TOOLOPT(param->bEnableStrongIntraSmoothing, "strong-intra-smoothing"); TOOLVAL(param->lookaheadSlices, "lslices=%d"); +if (param->maxSlices > 1) +TOOLVAL(param->maxSlices, "slices=%d"); if (param->bEnableLoopFilter) { if (param->deblockingFilterBetaOffset || param->deblockingFilterTCOffset) diff -r 49a0d1176aef -r cc9c2243e589 source/x265.h --- a/source/x265.h Wed Jul 27 21:47:20 2016 +0200 +++ b/source/x265.h Wed Aug 31 13:37:51 2016 -0500 @@ -1294,6 +1294,9 @@ * value to that value. 
*/ uint16_t maxLuma; +/* Maximum count of Slices of picture, the value range is [1, maximum rows] */ +unsigned int maxSlices; + } x265_param; /* x265_param_alloc: diff -r 49a0d1176aef -r cc9c2243e589 source/x265cli.h --- a/source/x265cli.h Wed Jul 27 21:47:20 2016 +0200 +++ b/source/x265cli.h Wed Aug 31 13:37:51 2016 -0500 @@ -232,6 +232,7 @@ { "no-temporal-layers", no_argument, NULL, 0 }, { "qg-size",required_argument, NULL, 0 }, { "recon-y4m-exec", required_argument, NULL, 0 }, +{ "slices", required_argument, NULL, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 01 of 18] [slices] new option --slices
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1472668671 18000 # Node ID cc9c2243e589d6bb41c9eb5518000826f8970551 # Parent 49a0d1176aef5bc6330fcfd39b4589616c174f0a [slices] new option --slices --- source/common/param.cpp |6 ++ source/x265.h |3 +++ source/x265cli.h|1 + 3 files changed, 10 insertions(+), 0 deletions(-) diff -r 49a0d1176aef -r cc9c2243e589 source/common/param.cpp --- a/source/common/param.cpp Wed Jul 27 21:47:20 2016 +0200 +++ b/source/common/param.cpp Wed Aug 31 13:37:51 2016 -0500 @@ -251,6 +251,7 @@ param->maxFALL = 0; param->minLuma = 0; param->maxLuma = PIXEL_MAX; +param->maxSlices = 1; } int x265_param_default_preset(x265_param* param, const char* preset, const char* tune) @@ -888,6 +889,7 @@ OPT("min-luma") p->minLuma = (uint16_t)atoi(value); OPT("max-luma") p->maxLuma = (uint16_t)atoi(value); OPT("uhd-bd") p->uhdBluray = atobool(value); +OPT("slices") p->maxSlices = atoi(value); else bExtraParams = true; if (bExtraParams) @@ -1223,6 +1225,8 @@ "qpmax exceeds supported range (0 to 69)"); CHECK(param->rc.qpMin < QP_MIN || param->rc.qpMin > QP_MAX_MAX, "qpmin exceeds supported range (0 to 69)"); +CHECK(1 > param->maxSlices || param->maxSlices > ((param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize), +"The slices can not be more than rows"); return check_failed; } @@ -1373,6 +1377,8 @@ TOOLOPT(param->bEnableFastIntra, "fast-intra"); TOOLOPT(param->bEnableStrongIntraSmoothing, "strong-intra-smoothing"); TOOLVAL(param->lookaheadSlices, "lslices=%d"); +if (param->maxSlices > 1) +TOOLVAL(param->maxSlices, "slices=%d"); if (param->bEnableLoopFilter) { if (param->deblockingFilterBetaOffset || param->deblockingFilterTCOffset) diff -r 49a0d1176aef -r cc9c2243e589 source/x265.h --- a/source/x265.h Wed Jul 27 21:47:20 2016 +0200 +++ b/source/x265.h Wed Aug 31 13:37:51 2016 -0500 @@ -1294,6 +1294,9 @@ * value to that value. 
*/ uint16_t maxLuma; +/* Maximum count of Slices of picture, the value range is [1, maximum rows] */ +unsigned int maxSlices; + } x265_param; /* x265_param_alloc: diff -r 49a0d1176aef -r cc9c2243e589 source/x265cli.h --- a/source/x265cli.h Wed Jul 27 21:47:20 2016 +0200 +++ b/source/x265cli.h Wed Aug 31 13:37:51 2016 -0500 @@ -232,6 +232,7 @@ { "no-temporal-layers", no_argument, NULL, 0 }, { "qg-size",required_argument, NULL, 0 }, { "recon-y4m-exec", required_argument, NULL, 0 }, +{ "slices", required_argument, NULL, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: fix typo (mismatched closing bracket in vstm register list)
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1469647787 18000 # Node ID 2aa5421a3500c3b1912db1db94208e354c6954f5 # Parent 5a0e139e29386ecebafc9c555aedcd3e0f61c70c asm: fix typo mistake --- source/common/arm/dct-a.S |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff -r 5a0e139e2938 -r 2aa5421a3500 source/common/arm/dct-a.S --- a/source/common/arm/dct-a.S Fri Jul 22 13:13:42 2016 +0530 +++ b/source/common/arm/dct-a.S Wed Jul 27 14:29:47 2016 -0500 @@ -216,7 +216,7 @@ vqrshrn.s32 d19, q14, 2 vqrshrn.s32 d23, q15, 2 -vstm r1!, {d16-d23] +vstm r1!, {d16-d23} // bottom half vld1.16 {q12}, [r0], r2 @@ -262,7 +262,7 @@ vqrshrn.s32 d19, q14, 2 vqrshrn.s32 d23, q15, 2 -vstm r1, {d16-d23] +vstm r1, {d16-d23} mov r1, r3 // DCT-2D ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] Reduce operators on row address compare
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1467393737 18000 # Node ID c3ed095a6e735f2d95fa7571ab16e3d510a4f5d2 # Parent 836a870ba76b46d4c0078289e320db1371fc3403 Reduce operators on row address compare --- source/common/cudata.cpp |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff -r 836a870ba76b -r c3ed095a6e73 source/common/cudata.cpp --- a/source/common/cudata.cpp Fri Jul 01 11:49:57 2016 +0530 +++ b/source/common/cudata.cpp Fri Jul 01 12:22:17 2016 -0500 @@ -68,7 +68,7 @@ inline bool isEqualRow(int addrA, int addrB, int numUnits) { // addrA / numUnits == addrB / numUnits -return ((addrA ^ addrB) & ~(numUnits - 1)) == 0; +return ((addrA ^ addrB) < numUnits); } /* Check whether 2 addresses point to the same row or column */ @@ -88,7 +88,7 @@ inline bool isZeroRow(int addr, int numUnits) { // addr / numUnits == 0 -return (addr & ~(numUnits - 1)) == 0; +return (addr < numUnits); } /* Check whether one address points to a column whose index is smaller than a given value */ ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: fix output change bug in pixel_sa8d_32x32 caused by intermediate result overflow
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1467233520 18000 # Node ID a4f46c182d42080d6674b665cedfd8ec90a47e62 # Parent 626fcbac7ffba723dabd3a9f0507c4c80f3e7bc9 asm: fix output change bug in pixel_sa8d_32x32, the reason is intermedia result overflow --- source/common/x86/pixel-a.asm | 48 +++- 1 files changed, 32 insertions(+), 16 deletions(-) diff -r 626fcbac7ffb -r a4f46c182d42 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Thu Jun 16 12:57:38 2016 +0530 +++ b/source/common/x86/pixel-a.asm Wed Jun 29 15:52:00 2016 -0500 @@ -14041,10 +14041,12 @@ psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax -paddw m0, m1 -paddw m2, m8 pmaddwd m0, m7 +pmaddwd m1, m7 pmaddwd m2, m7 +pmaddwd m8, m7 +paddd m0, m1 +paddd m2, m8 paddd m10, m0, m2 @@ -14083,10 +14085,12 @@ psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax -paddw m0, m1 -paddw m2, m8 pmaddwd m0, m7 +pmaddwd m1, m7 pmaddwd m2, m7 +pmaddwd m8, m7 +paddd m0, m1 +paddd m2, m8 paddd m12, m0, m2 @@ -14125,10 +14129,12 @@ psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax -paddw m0, m1 -paddw m2, m8 pmaddwd m0, m7 +pmaddwd m1, m7 pmaddwd m2, m7 +pmaddwd m8, m7 +paddd m0, m1 +paddd m2, m8 paddd m12, m0 paddd m12, m2 @@ -14171,10 +14177,12 @@ psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax -paddw m0, m1 -paddw m2, m8 pmaddwd m0, m7 +pmaddwd m1, m7 pmaddwd m2, m7 +pmaddwd m8, m7 +paddd m0, m1 +paddd m2, m8 paddd m10, m0 paddd m10, m2 @@ -14218,10 +14226,12 @@ psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax -paddw m0, m1 -paddw m2, m8 pmaddwd m0, m7 +pmaddwd m1, m7 pmaddwd m2, m7 +pmaddwd m8, m7 +paddd m0, m1 +paddd m2, m8 paddd m12, m0, m2 @@ -14260,10 +14270,12 @@ psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax -paddw m0, m1 -paddw m2, m8 pmaddwd m0, m7 +pmaddwd m1, m7 pmaddwd m2, m7 +pmaddwd m8, m7 +paddd m0, m1 +paddd m2, m8 paddd m13, m0, m2 @@ -14302,10 +14314,12 @@ psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, 
amax -paddw m0, m1 -paddw m2, m8 pmaddwd m0, m7 +pmaddwd m1, m7 pmaddwd m2, m7 +pmaddwd m8, m7 +paddd m0, m1 +paddd m2, m8 paddd m13, m0 paddd m13, m2 @@ -14348,10 +14362,12 @@ psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax -paddw m0, m1 -paddw m2, m8 pmaddwd m0, m7 +pmaddwd m1, m7 pmaddwd m2, m7 +pmaddwd m8, m7 +paddd m0, m1 +paddd m2, m8 paddd m12, m0 paddd m12, m2 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] cmake: support iPhone cross-compilation on the Mac OS X platform
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1467135300 18000 # Node ID 8974c28e7d6ac481028a860a415a31eb64885043 # Parent 626fcbac7ffba723dabd3a9f0507c4c80f3e7bc9 cmake: support IPhone cross compile on Mac OS X platform diff -r 626fcbac7ffb -r 8974c28e7d6a build/arm-ios/ios.cmake --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/build/arm-ios/ios.cmake Tue Jun 28 12:35:00 2016 -0500 @@ -0,0 +1,18 @@ +# CMake toolchain file for cross compiling x265 for ARM arch +# This feature is only supported as experimental. Use with caution. +# Please report bugs on bitbucket +# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source + +set(CROSS_COMPILE_ARM 1) +set(CMAKE_SYSTEM_NAME Darwin) +set(CMAKE_SYSTEM_PROCESSOR armv7l) +set(CMAKE_OSX_DEPLOYMENT_TARGET 0) + +# specify the cross compiler +set(CMAKE_C_COMPILER clang) +set(CMAKE_CXX_COMPILER clang++) + +# specify the target environment +set(CMAKE_FIND_ROOT_PATH ${IOS_PLATFORM_SDK}) + + diff -r 626fcbac7ffb -r 8974c28e7d6a build/arm-ios/make-Makefiles.bash --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/build/arm-ios/make-Makefiles.bash Tue Jun 28 12:35:00 2016 -0500 @@ -0,0 +1,7 @@ +#!/bin/bash +# Run this from within a bash shell + +IOS_PLATFORM_SDK=`xcrun --show-sdk-path --sdk iphoneos` +IOS_CFLAGS="-arch armv7 -mfpu=neon -pthread -miphoneos-version-min=7.0 -Qunused-arguments" + +cmake -DCMAKE_TOOLCHAIN_FILE=ios.cmake -DCMAKE_C_FLAGS="$IOS_CFLAGS" -DCMAKE_CXX_FLAGS="$IOS_CFLAGS" -DCMAKE_OSX_SYSROOT="$IOS_PLATFORM_SDK" -DIOS_PLATFORM_SDK="$IOS_PLATFORM_SDK" -G "Unix Makefiles" ../../source && ccmake ../../source diff -r 626fcbac7ffb -r 8974c28e7d6a source/CMakeLists.txt --- a/source/CMakeLists.txt Thu Jun 16 12:57:38 2016 +0530 +++ b/source/CMakeLists.txt Tue Jun 28 12:35:00 2016 -0500 @@ -187,7 +187,12 @@ endif() endif() if(ARM AND CROSS_COMPILE_ARM) -set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm) +message(STATUS 
${PLATFORM_LIBS}) +if(APPLE) +#set(ARM_ARGS -arch armv7 -arch armv7s -arch arm64) +else() +set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm) +endif() elseif(ARM) find_package(Neon) if(CPU_HAS_NEON) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] cmake: support iPhone cross-compilation on the Mac OS X platform
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1466721751 18000 # Node ID c4215deac61f8a4d56d0e7247b9913276fc9555b # Parent 626fcbac7ffba723dabd3a9f0507c4c80f3e7bc9 cmake: support IPhone cross compile on Mac OS X platform diff -r 626fcbac7ffb -r c4215deac61f build/arm-ios/ios.cmake --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/build/arm-ios/ios.cmake Thu Jun 23 17:42:31 2016 -0500 @@ -0,0 +1,17 @@ +# CMake toolchain file for cross compiling x265 for ARM arch +# This feature is only supported as experimental. Use with caution. +# Please report bugs on bitbucket +# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source + +set(CROSS_COMPILE_ARM 1) +set(CMAKE_SYSTEM_NAME Darwin) +set(CMAKE_SYSTEM_PROCESSOR armv7l) + +# specify the cross compiler +set(CMAKE_C_COMPILER clang) +set(CMAKE_CXX_COMPILER clang++) + +# specify the target environment +set(CMAKE_FIND_ROOT_PATH ${IOS_PLATFORM_SDK}) + + diff -r 626fcbac7ffb -r c4215deac61f build/arm-ios/make-Makefiles.bash --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/build/arm-ios/make-Makefiles.bash Thu Jun 23 17:42:31 2016 -0500 @@ -0,0 +1,7 @@ +#!/bin/bash +# Run this from within a bash shell + +IOS_PLATFORM_SDK=`xcrun --show-sdk-path --sdk iphoneos` +IOS_CFLAGS="-arch armv7 -mfpu=neon -pthread -isysroot $IOS_PLATFORM_SDK -miphoneos-version-min=7.0 -Qunused-arguments" + +cmake -DCMAKE_TOOLCHAIN_FILE=ios.cmake -DCMAKE_C_FLAGS="$IOS_CFLAGS" -DCMAKE_CXX_FLAGS="$IOS_CFLAGS" -DIOS_PLATFORM_SDK="$IOS_PLATFORM_SDK" -G "Unix Makefiles" ../../source && ccmake ../../source diff -r 626fcbac7ffb -r c4215deac61f source/CMakeLists.txt --- a/source/CMakeLists.txt Thu Jun 16 12:57:38 2016 +0530 +++ b/source/CMakeLists.txt Thu Jun 23 17:42:31 2016 -0500 @@ -187,7 +187,11 @@ endif() endif() if(ARM AND CROSS_COMPILE_ARM) -set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm) +if(APPLE) +#set(ARM_ARGS -arch armv7 -arch armv7s -arch arm64) +else() 
+set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm) +endif() elseif(ARM) find_package(Neon) if(CPU_HAS_NEON) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm_arm: NEON version of dct[16x16]
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1466207075 18000 # Node ID 9475503fb46c1b4118441c62797799c454534e62 # Parent 78ffb67a844e3e76facf18c52790f1bd544754d6 asm_arm: NEON version of dct[16x16] --- source/common/arm/asm-primitives.cpp |1 + source/common/arm/dct-a.S| 522 +- source/common/arm/dct8.h |1 + 3 files changed, 513 insertions(+), 11 deletions(-) diff -r 78ffb67a844e -r 9475503fb46c source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Fri Jun 10 15:53:28 2016 +0530 +++ b/source/common/arm/asm-primitives.cpp Fri Jun 17 18:44:35 2016 -0500 @@ -1007,6 +1007,7 @@ p.cu[BLOCK_4x4].dct = PFX(dct_4x4_neon); p.cu[BLOCK_8x8].dct = PFX(dct_8x8_neon); +p.cu[BLOCK_16x16].dct = PFX(dct_16x16_neon); #if !HIGH_BIT_DEPTH p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon); #endif // !HIGH_BIT_DEPTH diff -r 78ffb67a844e -r 9475503fb46c source/common/arm/dct-a.S --- a/source/common/arm/dct-a.S Fri Jun 10 15:53:28 2016 +0530 +++ b/source/common/arm/dct-a.S Fri Jun 17 18:44:35 2016 -0500 @@ -120,17 +120,6 @@ bx lr endfunc -.align 4 -ctr4: -.word 83// d0[0] = 83 -.word 36// d0[1] = 36 -ctr8: -.word 75// d1[0] = 75 -.word 89// d1[1] = 89 -.word 18// d2[0] = 18 -.word 50// d2[1] = 50 - - /* uses registers q4 - q7 for temp values */ .macro tr4 r0, r1, r2, r3 vsub.s32q8, \r0, \r3// EO0 @@ -398,3 +387,514 @@ bx lr endfunc + +.align 8 +pw_tr16: .hword 90, 87, 80, 70, 57, 43, 25, 9 // q0 = [ 9 25 43 57 70 80 87 90] + .hword 83, 36, 75, 89, 18, 50, 00, 00 // q1 = [ x x 50 18 89 75 36 83] + +.align 8 +ctr4: +.word 83// d0[0] = 83 +.word 36// d0[1] = 36 +ctr8: +.word 75// d1[0] = 75 +.word 89// d1[1] = 89 +.word 18// d2[0] = 18 +.word 50// d2[1] = 50 +ctr16: +.word 90, 87// d0 +.word 80, 70// d1 +.word 57, 43// d2 +.word 25, 9// d3 + +/* void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride) */ +function x265_dct_16x16_neon +push {lr} + +// fill 3 of pipeline stall cycles (dependency link on SP) +add r2, r2 +adr r3, 
pw_tr16 +mov r12, #16/4 + +vpush {q4-q7} + +// TODO: 16x16 transpose buffer (may share with input buffer in future) +sub sp, #16*16*2 + +vld1.16 {d0-d3}, [r3] +mov r3, sp +mov lr, #4*16*2 + +// DCT-1D +.loop1: +// Row[0-3] +vld1.16 {q8-q9}, [r0, :64], r2 // q8 = [07 06 05 04 03 02 01 00], q9 = [0F 0E 0D 0C 0B 0A 09 08] +vld1.16 {q10-q11}, [r0, :64], r2// q10 = [17 16 15 14 13 12 11 10], q11 = [1F 1E 1D 1C 1B 1A 19 18] +vld1.16 {q12-q13}, [r0, :64], r2// q12 = [27 26 25 24 23 22 21 20], q13 = [2F 2E 2D 2C 2B 2A 29 28] +vld1.16 {q14-q15}, [r0, :64], r2// q14 = [37 36 35 34 33 32 31 30], q15 = [3F 3E 3D 3C 3B 3A 39 38] + +// Register map +// | 16 17 18 19 | +// | 20 21 22 23 | +// | 24 25 26 27 | +// | 28 29 30 31 | + +// Transpose 16x4 +vtrn.32 q8, q12 // q8 = [25 24 05 04 21 20 01 00], q12 = [27 26 07 06 23 22 03 02] +vtrn.32 q10, q14// q10 = [35 34 15 14 31 30 11 10], q14 = [37 36 17 16 33 32 13 12] +vtrn.32 q9, q13 // q9 = [2D 2C 0D 0C 29 28 09 08], q13 = [2F 2E 0F 0E 2B 2A 0B 0A] +vtrn.32 q11, q15// q11 = [3D 3C 1D 1C 39 38 19 18], q15 = [3F 3E 1F 1E 3B 3A 1B 1A] + +vtrn.16 q8, q10 // q8 = [34 24 14 04 30 20 10 00], q10 = [35 25 15 05 31 21 11 01] +vtrn.16 q12, q14// q12 = [36 26 16 06 32 22 12 02], q14 = [37 27 17 07 33 23 13 03] +vtrn.16 q13, q15// q13 = [3E 2E 1E 0E 3A 2A 1A 0A], q15 = [3F 2F 1F 0F 3B 2B 1B 0B] +vtrn.16 q9, q11 // q9 = [3C 2C 1C 0C 38 28 18 08], q11 = [3D 2D 1D 0D 39 29 19 09] + +vswp d26, d27 // q13 = [3A 2A 1A 0A 3E 2E 1E 0E] +vswp d30, d31 // q15 = [3B 2B 1B 0B 3F 2F 1F 0F] +vswp d18, d19 // q9 = [38 28 18 08 3C 2C 1C 0C] +vswp d22, d23 // q11 = [39 29 19 09 3D 2D 1D 0D] + +// E[0-7] - 10 bits +vadd.s16 q4, q8, q15// q4 = [E4 E0] +vadd.s16 q5, q10, q13 // q5 = [E5 E1] +vadd.s16 q6, q12, q11 // q6 = [E6 E2] +vadd.s16 q7, q14, q9// q7 = [E7 E3] + +// O[0-7] - 10 bits +vsub.s16 q8, q8, q15// q8 = [O4 O0] +vsub.s16 q9, q14, q9// q9 = [O7 O3] +vsub.s16 q10, q10, q13 // q10 = [O5 O1] +vsub.s16 q11, q12, q11 // q11 = [O6 O2] + +// reorder Ex for
[x265] [PATCH 2 of 2] asm: fix output change bug in pixel_sa8d_16x16 caused by intermediate result overflow
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1465938978 18000 # Node ID 362976c6bf6853e75cec0e94e48941eed4737269 # Parent 3d8e1d324c9f4bd50eeb1addf85507b668ef3fe9 asm: fix output change bug in pixel_sa8d_16x16, the reason is intermedia result overflow --- source/common/x86/pixel-a.asm | 22 +- 1 files changed, 13 insertions(+), 9 deletions(-) diff -r 3d8e1d324c9f -r 362976c6bf68 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Tue Jun 14 16:16:14 2016 -0500 +++ b/source/common/x86/pixel-a.asm Tue Jun 14 16:16:18 2016 -0500 @@ -13910,7 +13910,50 @@ lea r7, [r2+4*r3] vbroadcasti128 m7, [pw_1] -;call pixel_sa8d_8x8_internal ; pix[0] +; Top 16x8 +;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +movu m0, [r0] ; 10 bits +movu m5, [r2] +psubw m0, m5; 11 bits +movu m1, [r0 + r1] +movu m6, [r2 + r3] +psubw m1, m6 +movu m2, [r0 + r1 * 2] +movu m5, [r2 + r3 * 2] +psubw m2, m5 +movu m8, [r0 + r4] +movu m6, [r2 + r5] +psubw m8, m6 + +;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 +movu m4, [r6] +movu m11, [r7] +psubw m4, m11 +movu m5, [r6 + r1] +movu m6, [r7 + r3] +psubw m5, m6 +movu m3, [r6 + r1 * 2] +movu m11, [r7 + r3 * 2] +psubw m3, m11 +movu m9, [r6 + r4] +movu m6, [r7 + r5] +psubw m9, m6 + +HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax; 16 bits +pmaddwd m0, m7 +pmaddwd m1, m7 +pmaddwd m2, m7 +pmaddwd m8, m7 +paddd m0, m1 +paddd m2, m8 +paddd m10, m0, m2 + +lea r0, [r0+8*r1] +lea r2, [r2+8*r3] +lea r6, [r6+8*r1] +lea r7, [r7+8*r3] + +; Bottom 16x8 ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 movu m0, [r0] movu m5, [r2] @@ -13940,51 +13983,12 @@ psubw m9, m6 HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax -paddw m0, m1 -paddw m2, m8 pmaddwd m0, m7 +pmaddwd m1, m7 pmaddwd m2, m7 -paddd m10, m0, m2 - -lea r0, [r0+8*r1] -lea r2, [r2+8*r3] -lea r6, [r6+8*r1] -lea r7, [r7+8*r3] - -;call pixel_sa8d_8x8_internal ; pix[8*stride+8] -;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 -movu m0, [r0] -movu m5, [r2] -psubw m0, m5 -movu m1, [r0 + r1] 
-movu m6, [r2 + r3] -psubw m1, m6 -movu m2, [r0 + r1 * 2] -movu m5, [r2 + r3 * 2] -psubw m2, m5 -movu m8, [r0 + r4] -movu m6, [r2 + r5] -psubw m8, m6 - -;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 -movu m4, [r6] -movu m11, [r7] -psubw m4, m11 -movu m5, [r6 + r1] -movu m6, [r7 + r3] -psubw m5, m6 -movu m3, [r6 + r1 * 2] -movu m11, [r7 + r3 * 2] -psubw m3, m11 -movu m9, [r6 + r4] -movu m6, [r7 + r5] -psubw m9, m6 - -HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax -paddw m0, m1 -paddw m2, m8 -pmaddwd m0, m7 -pmaddwd m2, m7 +pmaddwd m8, m7 +paddd m0, m1 +paddd m2, m8 paddd m10, m0 paddd m10, m2 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 2] fix undefined INT64_MAX in VS2008
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1465938974 18000 # Node ID 3d8e1d324c9f4bd50eeb1addf85507b668ef3fe9 # Parent 106a5a7dc4b337121c11484bc3bc4900b8a0d9a4 fix undefined INT64_MAX in VS2008 --- source/compat/msvc/stdint.h |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff -r 106a5a7dc4b3 -r 3d8e1d324c9f source/compat/msvc/stdint.h --- a/source/compat/msvc/stdint.h Thu Jun 09 13:34:55 2016 -0500 +++ b/source/compat/msvc/stdint.h Tue Jun 14 16:16:14 2016 -0500 @@ -8,6 +8,7 @@ #if !defined(UINT64_MAX) #include #define UINT64_MAX _UI64_MAX +#define INT64_MAX _I64_MAX #define INT16_MAX _I16_MAX #endif ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 2] asm_arm: rewrite NEON version of count_nonzero*
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1465497292 18000 # Node ID 7dce8656504fdf8d25f67c7a97b781a031bbdf8a # Parent 0af296185f7ae3e05493ecf164046ddfec085bb3 asm_arm: rewrite NEON version of count_nonzero* Origin: count_nonzero[4x4] 1.09x20.88 22.69 count_nonzero[8x8] 1.25x22.65 28.23 count_nonzero[16x16]1.74x30.91 53.67 count_nonzero[32x32]2.31x64.70 149.60 New: count_nonzero[4x4] 1.13x20.04 22.71 count_nonzero[8x8] 1.33x21.02 27.95 count_nonzero[16x16]2.02x26.72 53.95 count_nonzero[32x32]2.95x50.76 149.61 --- source/common/arm/blockcopy8.S | 230 +++- 1 files changed, 158 insertions(+), 72 deletions(-) diff -r 0af296185f7a -r 7dce8656504f source/common/arm/blockcopy8.S --- a/source/common/arm/blockcopy8.STue Jun 07 09:20:11 2016 +0530 +++ b/source/common/arm/blockcopy8.SThu Jun 09 13:34:52 2016 -0500 @@ -664,89 +664,175 @@ // int count_nonzero_c(const int16_t* quantCoeff) function x265_count_nonzero_4_neon -veord4, d4 -.rept 2 -vld1.s16{d0}, [r0]! -vld1.s16{d1}, [r0]! -vclz.i16d2, d0 -vclz.i16d3, d1 -vshr.u16q1, #4 -vadd.u16d2, d3 -vadd.u16d4, d2 -.endr -vpadd.u16 d4, d4 -vpadd.u16 d4, d4 -vmov.u16r12, d4[0] -rsb r0, r12, #16 +vld1.s16{d0-d3}, [r0] +vceq.u16q0, #0 +vceq.u16q1, #0 +eor r1, r1 +vtrn.8 q0, q1 + +vshr.u8 q0, #7 + +vadd.u8 d0, d1 +vshr.u64d1, d0, #32 +vadd.u8 d0, d1 +vmov.u32r0, d0[0] +usad8 r0, r0, r1 +rsb r0, #16 bx lr endfunc function x265_count_nonzero_8_neon -veorq8, q8 -.rept 4 -vld1.s16{q0}, [r0]! -vld1.s16{q1}, [r0]! 
-vclz.i16q2, q0 -vclz.i16q3, q1 -vshr.u16q2, #4 -vshr.u16q3, #4 -vadd.u16q2, q3 -vadd.u16q8, q2 -.endr -vadd.u16d16, d17 -vpadd.u16 d16, d16 -vpadd.u16 d16, d16 -vmov.u16r12, d16[0] -rsb r0, r12, #64 +vldmr0, {q8-q15} +eor r1, r1 +vceq.u16q8, #0 +vceq.u16q9, #0 +vceq.u16q10, #0 +vceq.u16q11, #0 +vceq.u16q12, #0 +vceq.u16q13, #0 +vceq.u16q14, #0 +vceq.u16q15, #0 + +vtrn.8 q8, q9 +vtrn.8 q10, q11 +vtrn.8 q12, q13 +vtrn.8 q14, q15 + +vadd.s8 q8, q10 +vadd.s8 q12, q14 +vadd.s8 q8, q12 + +vadd.s8 d16, d17 +vshr.u64d17, d16, #32 +vadd.s8 d16, d17 +vabs.s8 d16, d16 + +vmov.u32r0, d16[0] +usad8 r0, r0, r1 +rsb r0, #64 bx lr endfunc function x265_count_nonzero_16_neon -veorq2, q2 -.rept 16 -vld1.s16{q0, q1}, [r0]! -vclz.i16q8, q0 -vclz.i16q9, q1 -vshr.u16q8, #4 -vshr.u16q9, #4 -vadd.u16q8, q9 -vadd.u16q2, q8 +vldmr0!, {q8-q15} +eor r1, r1 +vceq.u16q8, #0 +vceq.u16q9, #0 +vceq.u16q10, #0 +vceq.u16q11, #0 +vceq.u16q12, #0 +vceq.u16q13, #0 +vceq.u16q14, #0 +vceq.u16q15, #0 + +vtrn.8 q8, q9 +vtrn.8 q10, q11 +vtrn.8 q12, q13 +vtrn.8 q14, q15 + +vmovq0, q8 +vmovq1, q10 +vmovq2, q12 +vmovq3, q14 + +.rept 3 +vldmr0!, {q8-q15} +vceq.u16q8, #0 +vceq.u16q9, #0 +vceq.u16q10, #0 +vceq.u16q11, #0 +vceq.u16q12, #0 +vceq.u16q13, #0 +vceq.u16q14, #0 +vceq.u16q15, #0 + +vtrn.8 q8, q9 +vtrn.8 q10, q11 +vtrn.8 q12, q13 +vtrn.8 q14, q15 + +vadd.s8 q0, q8 +vadd.s8 q1, q10 +vadd.s8 q2, q12 +vadd.s8 q3, q14 .endr -vadd.u16d4, d5 -vpadd.u16 d4, d4 -vpadd.u16 d4, d4 -vmov.u16r12, d4[0] -rsb r0, r12, #256 +vadd.s8 q0, q1 +vadd.s8 q2, q3 +vadd.s8 q0, q2 // dynamic range is 4+1 bits + +vadd.s8 d0, d1 +vshr.u64d1, d0, #32 +vadd.s8 d0, d1 +vabs.s8 d0, d0 // maximum value of each element are 64 + +vmov.u32r0, d0[0] +
[x265] [PATCH 2 of 2] asm_arm: rewrite NEON version of dequant_normal
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1465497295 18000 # Node ID a22130631abb598b10f3f0beecf92af223d778fe # Parent 7dce8656504fdf8d25f67c7a97b781a031bbdf8a asm_arm: rewrite NEON version of dequant_normal OLD: dequant_normal 9.87x199.80 1971.87 NEW: dequant_normal 16.16x 122.04 1971.56 --- source/common/arm/pixel-util.S | 60 ++-- 1 files changed, 27 insertions(+), 33 deletions(-) diff -r 7dce8656504f -r a22130631abb source/common/arm/pixel-util.S --- a/source/common/arm/pixel-util.SThu Jun 09 13:34:52 2016 -0500 +++ b/source/common/arm/pixel-util.SThu Jun 09 13:34:55 2016 -0500 @@ -2293,44 +2293,38 @@ // void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift) function x265_dequant_normal_neon -push{r4, r5, r6} -ldr r4, [sp, #12] // shift -#if HIGH_BIT_DEPTH -cmp r3, #32767 -jle .skip -shr r3, (BIT_DEPTH - 8) -sub r4, (BIT_DEPTH - 8) -.skip: +ldr r12, [sp]// shift +#if HIGH_BIT_DEPTH // NEVER TEST path +cmp r3, #32768 +lsrlt r3, #(BIT_DEPTH - 8) +sublt r12, #(BIT_DEPTH - 8) #endif -mov r12, #1 -sub r5, r4, #1 -lsr r2, #3 // num / 8 -lsl r5, r12, r5 // 1 << shift - 1 +lsr r2, #4 // num / 16 -neg r6, r4 -vdup.32 q0, r3 -vdup.32 q1, r6 -vdup.32 q2, r5 +neg r12, r12 +vdup.16 q0, r3 +vdup.32 q1, r12 -dqn_loop1: -vld1.16 {q3}, [r0]! -vmovl.s16 q8, d6 -vmovl.s16 q9, d7 +.dqn_loop1: +vld1.16 {d4-d7}, [r0]! -vmul.s32q8, q0 -vmul.s32q9, q0 -vadd.s32q8, q2 -vadd.s32q9, q2 +vmull.s16 q8, d4, d0 +vmull.s16 q9, d5, d0 +vmull.s16 q10, d6, d0 +vmull.s16 q11, d7, d0 -vshl.s32q8, q1 -vshl.s32q9, q1 +vrshl.s32 q8, q1 +vrshl.s32 q9, q1 +vrshl.s32 q10, q1 +vrshl.s32 q11, q1 vqmovn.s32 d16, q8 vqmovn.s32 d17, q9 +vqmovn.s32 d18, q10 +vqmovn.s32 d19, q11 subsr2, #1 -vst1.16 {q8}, [r1]! -bne dqn_loop1 -pop {r4, r5, r6} +vst1.16 {d16-d19}, [r1]! +bgt.dqn_loop1 bx lr endfunc ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 2] asm_arm: redesign algorithm and rewrite interp_8tap_vert_pp_4xN
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1463589741 18000 # Node ID 1fbcfda38731342670911c738342d6e57f75467c # Parent 46c45f236ab0b25ec92a892f12315024eae2a11d asm_arm: redesign algorithm and rewrite interp_8tap_vert_pp_4xN Origin: luma_vpp[ 4x4] 1.87x45.23 84.41 luma_vpp[ 4x8] 2.10x70.36 147.78 luma_vpp[ 4x16] 2.25x121.24 272.18 New: luma_vpp[ 4x4] 3.10x27.47 85.05 luma_vpp[ 4x8] 4.59x32.21 147.76 luma_vpp[ 4x16] 6.38x42.73 272.48 --- source/common/arm/ipfilter8.S | 157 +--- 1 files changed, 82 insertions(+), 75 deletions(-) diff -r 46c45f236ab0 -r 1fbcfda38731 source/common/arm/ipfilter8.S --- a/source/common/arm/ipfilter8.S Wed May 18 11:42:18 2016 -0500 +++ b/source/common/arm/ipfilter8.S Wed May 18 11:42:21 2016 -0500 @@ -3,6 +3,7 @@ * * Authors: Dnyaneshwar G <dnyanesh...@multicorewareinc.com> * Radhakrishnan VR <radhakrish...@multicorewareinc.com> + * Min Chen <min.c...@multicorewareinc.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -42,6 +43,7 @@ .word -2, -2, 16, 16, 54, 54, -4 ,-4 .word -2, -2, 10, 10, 58, 58, -2, -2 + .text // filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride) @@ -709,85 +711,90 @@ endfunc //**luma_vpp +.align 8 +// TODO: I don't like S16 in here, but the VMUL with scalar doesn't support (U8 x U8) +g_luma_s16: +.hword 0, 0, 0, 64, 0, 0, 0, 0 +.hword -1, 4, -10, 58, 17, -5, 1, 0 +.hword -1, 4, -11, 40, 40, -11, 4, -1 +.hword 0, 1, -5, 17, 58, -10, 4, -1 + .macro LUMA_VPP_4xN h function x265_interp_8tap_vert_pp_4x\h\()_neon -push {r4, r5, lr} -ldr r4, [sp, #4 * 3] -mov r5, r4, lsl #6 -mov r4, r1, lsl #2 -sub r4, r1 -sub r0, r4 +ldr r12, [sp] +push{lr} +adr lr, g_luma_s16 +sub r0, r1 +sub r0, r0, r1, lsl #1 // src -= 3 * srcStride +add lr, lr, r12, lsl #4 +vld1.16 {q0}, [lr, :64] // q8 = luma interpolate coeff +vdup.s16d24, d0[0] +vdup.s16d25, d0[1] +vdup.s16d26, 
d0[2] +vdup.s16d27, d0[3] +vdup.s16d28, d1[0] +vdup.s16d29, d1[1] +vdup.s16d30, d1[2] +vdup.s16d31, d1[3] -mov r4, #32 -vdup.32 q8, r4 -mov r4, #\h +mov r12, #\h + +// prepare to load 8 lines +vld1.u32{d0[0]}, [r0], r1 +vld1.u32{d0[1]}, [r0], r1 +vld1.u32{d2[0]}, [r0], r1 +vld1.u32{d2[1]}, [r0], r1 +vld1.u32{d4[0]}, [r0], r1 +vld1.u32{d4[1]}, [r0], r1 +vld1.u32{d6[0]}, [r0], r1 +vld1.u32{d6[1]}, [r0], r1 +vmovl.u8q0, d0 +vmovl.u8q1, d2 +vmovl.u8q2, d4 +vmovl.u8q3, d6 .loop_4x\h: -movrel r12, g_lumaFilter -add r12, r5 -mov lr, r0 +// TODO: read extra 1 row for speed optimize, may made crash on OS X platform! +vld1.u32{d16[0]}, [r0], r1 +vld1.u32{d16[1]}, [r0], r1 +vmovl.u8q8, d16 -vld1.u32d0[0], [lr], r1 -vld1.u32d0[1], [lr], r1 -vld1.u32d1[0], [lr], r1 -vld1.u32d1[1], [lr], r1 -vld1.u32d2[0], [lr], r1 -vld1.u32d2[1], [lr], r1 -vld1.u32d3[0], [lr], r1 -vld1.u32d3[1], [lr], r1 +// row[0-1] +vmul.s16q9, q0, q12 +vext.64 q11, q0, q1, 1 +vmul.s16q10, q11, q12 +vmovq0, q1 -veor.u8 q9, q9 +// row[2-3] +vmla.s16q9, q1, q13 +vext.64 q11, q1, q2, 1 +vmla.s16q10, q11, q13 +vmovq1, q2 -vmovl.u8q11, d0 -vmovl.u16 q12, d22 -vmovl.u16 q13, d23 -vld1.s32d20, [r12]! -vmov.s32d21, d20 -vmla.s32q9, q12, q10 -vld1.s32d20, [r12]! -vmov.s32d21, d20 -vmla.s32q9, q13, q10 +// row[4-5] +vmla.s16q9, q2, q14 +vext.64 q11, q2, q3, 1 +vmla.s16q10, q11, q14 +vmovq2, q3 -vmovl.u8q11, d1 -vmovl.u16 q12, d22 -vmovl.u16 q13, d23 -vld1.s32d20, [r12]! -vmov.s32d21, d20 -vmla.s32q9, q12, q10 -vld1.s32d20, [r12]! -vmov.s32d21, d20 -vmla.s32q9, q13, q10 +// row[6-7] +vmla.s16q9, q3, q15 +vext.64 q11, q3, q8, 1 +vmla.s16q10, q11, q15 +vmovq3, q8 -vmovl.u8q11, d2 -vmovl.u16 q12, d22 -vmovl.u16 q13, d23
[x265] [PATCH 1 of 2] asm_arm: improve interp_8tap_vert_pp_4xN By: 1. remove unnecessary cache prefetch instructions pld 2. replace register r6 by lr
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1463589738 18000 # Node ID 46c45f236ab0b25ec92a892f12315024eae2a11d # Parent 28cf9adfc82e3816189b26aaeb907393b2a82ed8 asm_arm: improve interp_8tap_vert_pp_4xN By: 1. remove unnecessary cache prefetch instructions pld 2. replace register r6 by lr Origin: luma_vpp[ 4x4] 1.87x45.23 84.41 luma_vpp[ 4x8] 2.10x70.36 147.78 luma_vpp[ 4x16] 2.25x121.24 272.18 Optimized: luma_vpp[ 4x4] 1.98x42.42 84.02 luma_vpp[ 4x8] 2.32x63.70 147.49 luma_vpp[ 4x16] 2.51x108.39 272.18 --- source/common/arm/ipfilter8.S | 31 +++ 1 files changed, 11 insertions(+), 20 deletions(-) diff -r 28cf9adfc82e -r 46c45f236ab0 source/common/arm/ipfilter8.S --- a/source/common/arm/ipfilter8.S Wed May 18 02:01:34 2016 + +++ b/source/common/arm/ipfilter8.S Wed May 18 11:42:18 2016 -0500 @@ -711,7 +711,7 @@ //**luma_vpp .macro LUMA_VPP_4xN h function x265_interp_8tap_vert_pp_4x\h\()_neon -push {r4, r5, r6} +push {r4, r5, lr} ldr r4, [sp, #4 * 3] mov r5, r4, lsl #6 mov r4, r1, lsl #2 @@ -725,24 +725,16 @@ .loop_4x\h: movrel r12, g_lumaFilter add r12, r5 -mov r6, r0 +mov lr, r0 -pld [r6] -vld1.u32d0[0], [r6], r1 -pld [r6] -vld1.u32d0[1], [r6], r1 -pld [r6] -vld1.u32d1[0], [r6], r1 -pld [r6] -vld1.u32d1[1], [r6], r1 -pld [r6] -vld1.u32d2[0], [r6], r1 -pld [r6] -vld1.u32d2[1], [r6], r1 -pld [r6] -vld1.u32d3[0], [r6], r1 -pld [r6] -vld1.u32d3[1], [r6], r1 +vld1.u32d0[0], [lr], r1 +vld1.u32d0[1], [lr], r1 +vld1.u32d1[0], [lr], r1 +vld1.u32d1[1], [lr], r1 +vld1.u32d2[0], [lr], r1 +vld1.u32d2[1], [lr], r1 +vld1.u32d3[0], [lr], r1 +vld1.u32d3[1], [lr], r1 veor.u8 q9, q9 @@ -795,8 +787,7 @@ subsr4, #1 bne .loop_4x\h -pop {r4, r5, r6} -bx lr +pop {r4, r5, pc} .ltorg endfunc .endm ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 2] asm_arm: redesign algorithm and rewrite interp_8tap_vert_pp_4xN
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1463589656 18000 # Node ID 482a330d8c0de2146694f71633814c5df5f2f556 # Parent d6990d957a9958a0b128b8a6d5c6a4954af99bbd asm_arm: redesign algorithm and rewrite interp_8tap_vert_pp_4xN Origin: luma_vpp[ 4x4] 1.87x45.23 84.41 luma_vpp[ 4x8] 2.10x70.36 147.78 luma_vpp[ 4x16] 2.25x121.24 272.18 New: luma_vpp[ 4x4] 3.10x27.47 85.05 luma_vpp[ 4x8] 4.59x32.21 147.76 luma_vpp[ 4x16] 6.38x42.73 272.48 --- source/common/arm/ipfilter8.S | 157 +--- 1 files changed, 82 insertions(+), 75 deletions(-) diff -r d6990d957a99 -r 482a330d8c0d source/common/arm/ipfilter8.S --- a/source/common/arm/ipfilter8.S Wed May 18 11:40:53 2016 -0500 +++ b/source/common/arm/ipfilter8.S Wed May 18 11:40:56 2016 -0500 @@ -3,6 +3,7 @@ * * Authors: Dnyaneshwar G <dnyanesh...@multicorewareinc.com> * Radhakrishnan VR <radhakrish...@multicorewareinc.com> + * Min Chen <min.c...@multicorewareinc.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -42,6 +43,7 @@ .word -2, -2, 16, 16, 54, 54, -4 ,-4 .word -2, -2, 10, 10, 58, 58, -2, -2 + .text // filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride) @@ -709,85 +711,90 @@ endfunc //**luma_vpp +.align 8 +// TODO: I don't like S16 in here, but the VMUL with scalar doesn't support (U8 x U8) +g_luma_s16: +.hword 0, 0, 0, 64, 0, 0, 0, 0 +.hword -1, 4, -10, 58, 17, -5, 1, 0 +.hword -1, 4, -11, 40, 40, -11, 4, -1 +.hword 0, 1, -5, 17, 58, -10, 4, -1 + .macro LUMA_VPP_4xN h function x265_interp_8tap_vert_pp_4x\h\()_neon -push {r4, r5, lr} -ldr r4, [sp, #4 * 3] -mov r5, r4, lsl #6 -mov r4, r1, lsl #2 -sub r4, r1 -sub r0, r4 +ldr r12, [sp] +push{lr} +adr lr, g_luma_s16 +sub r0, r1 +sub r0, r0, r1, lsl #1 // src -= 3 * srcStride +add lr, lr, r12, lsl #4 +vld1.16 {q0}, [lr, :64] // q8 = luma interpolate coeff +vdup.s16d24, d0[0] +vdup.s16d25, d0[1] +vdup.s16d26, d0[2] 
+vdup.s16d27, d0[3] +vdup.s16d28, d1[0] +vdup.s16d29, d1[1] +vdup.s16d30, d1[2] +vdup.s16d31, d1[3] -mov r4, #32 -vdup.32 q8, r4 -mov r4, #\h +mov r12, #\h + +// prepare to load 8 lines +vld1.u32{d0[0]}, [r0], r1 +vld1.u32{d0[1]}, [r0], r1 +vld1.u32{d2[0]}, [r0], r1 +vld1.u32{d2[1]}, [r0], r1 +vld1.u32{d4[0]}, [r0], r1 +vld1.u32{d4[1]}, [r0], r1 +vld1.u32{d6[0]}, [r0], r1 +vld1.u32{d6[1]}, [r0], r1 +vmovl.u8q0, d0 +vmovl.u8q1, d2 +vmovl.u8q2, d4 +vmovl.u8q3, d6 .loop_4x\h: -movrel r12, g_lumaFilter -add r12, r5 -mov lr, r0 +// TODO: read extra 1 row for speed optimize, may made crash on OS X platform! +vld1.u32{d16[0]}, [r0], r1 +vld1.u32{d16[1]}, [r0], r1 +vmovl.u8q8, d16 -vld1.u32d0[0], [lr], r1 -vld1.u32d0[1], [lr], r1 -vld1.u32d1[0], [lr], r1 -vld1.u32d1[1], [lr], r1 -vld1.u32d2[0], [lr], r1 -vld1.u32d2[1], [lr], r1 -vld1.u32d3[0], [lr], r1 -vld1.u32d3[1], [lr], r1 +// row[0-1] +vmul.s16q9, q0, q12 +vext.64 q11, q0, q1, 1 +vmul.s16q10, q11, q12 +vmovq0, q1 -veor.u8 q9, q9 +// row[2-3] +vmla.s16q9, q1, q13 +vext.64 q11, q1, q2, 1 +vmla.s16q10, q11, q13 +vmovq1, q2 -vmovl.u8q11, d0 -vmovl.u16 q12, d22 -vmovl.u16 q13, d23 -vld1.s32d20, [r12]! -vmov.s32d21, d20 -vmla.s32q9, q12, q10 -vld1.s32d20, [r12]! -vmov.s32d21, d20 -vmla.s32q9, q13, q10 +// row[4-5] +vmla.s16q9, q2, q14 +vext.64 q11, q2, q3, 1 +vmla.s16q10, q11, q14 +vmovq2, q3 -vmovl.u8q11, d1 -vmovl.u16 q12, d22 -vmovl.u16 q13, d23 -vld1.s32d20, [r12]! -vmov.s32d21, d20 -vmla.s32q9, q12, q10 -vld1.s32d20, [r12]! -vmov.s32d21, d20 -vmla.s32q9, q13, q10 +// row[6-7] +vmla.s16q9, q3, q15 +vext.64 q11, q3, q8, 1 +vmla.s16q10, q11, q15 +vmovq3, q8 -vmovl.u8q11, d2 -vmovl.u16 q12, d22 -vmovl.u16 q13, d23 -v
[x265] [PATCH 1 of 2] asm_arm: improve interp_8tap_vert_pp_4xN By: 1. remove unnecessary cache prefetch instructions pld 2. replace register r6 by lr
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1463589653 18000 # Node ID d6990d957a9958a0b128b8a6d5c6a4954af99bbd # Parent 28cf9adfc82e3816189b26aaeb907393b2a82ed8 asm_arm: improve interp_8tap_vert_pp_4xN By: 1. remove unnecessary cache prefetch instructions pld 2. replace register r6 by lr Origin: luma_vpp[ 4x4] 1.87x45.23 84.41 luma_vpp[ 4x8] 2.10x70.36 147.78 luma_vpp[ 4x16] 2.25x121.24 272.18 Optimized: luma_vpp[ 4x4] 1.98x42.42 84.02 luma_vpp[ 4x8] 2.32x63.70 147.49 luma_vpp[ 4x16] 2.51x108.39 272.18 --- source/common/arm/ipfilter8.S | 31 +++ 1 files changed, 11 insertions(+), 20 deletions(-) diff -r 28cf9adfc82e -r d6990d957a99 source/common/arm/ipfilter8.S --- a/source/common/arm/ipfilter8.S Wed May 18 02:01:34 2016 + +++ b/source/common/arm/ipfilter8.S Wed May 18 11:40:53 2016 -0500 @@ -711,7 +711,7 @@ //**luma_vpp .macro LUMA_VPP_4xN h function x265_interp_8tap_vert_pp_4x\h\()_neon -push {r4, r5, r6} +push {r4, r5, lr} ldr r4, [sp, #4 * 3] mov r5, r4, lsl #6 mov r4, r1, lsl #2 @@ -725,24 +725,16 @@ .loop_4x\h: movrel r12, g_lumaFilter add r12, r5 -mov r6, r0 +mov lr, r0 -pld [r6] -vld1.u32d0[0], [r6], r1 -pld [r6] -vld1.u32d0[1], [r6], r1 -pld [r6] -vld1.u32d1[0], [r6], r1 -pld [r6] -vld1.u32d1[1], [r6], r1 -pld [r6] -vld1.u32d2[0], [r6], r1 -pld [r6] -vld1.u32d2[1], [r6], r1 -pld [r6] -vld1.u32d3[0], [r6], r1 -pld [r6] -vld1.u32d3[1], [r6], r1 +vld1.u32d0[0], [lr], r1 +vld1.u32d0[1], [lr], r1 +vld1.u32d1[0], [lr], r1 +vld1.u32d1[1], [lr], r1 +vld1.u32d2[0], [lr], r1 +vld1.u32d2[1], [lr], r1 +vld1.u32d3[0], [lr], r1 +vld1.u32d3[1], [lr], r1 veor.u8 q9, q9 @@ -795,8 +787,7 @@ subsr4, #1 bne .loop_4x\h -pop {r4, r5, r6} -bx lr +pop {r4, r5, pc} .ltorg endfunc .endm ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 2] asm: new macro TRANSPOSE4x4x2_16 to avoid pipeline conflict in combo matrix transpose
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1463075381 18000 # Node ID f880db0a9a9b352077014aa69571d3169a37a2fc # Parent 3e530043698b9df0f9aba7eefbb381ac6cc79421 asm: new macro TRANSPOSE4x4x2_16 to avoid pipeline conflict in combo matrix transpose --- source/common/arm/asm.S | 21 - 1 files changed, 16 insertions(+), 5 deletions(-) diff -r 3e530043698b -r f880db0a9a9b source/common/arm/asm.S --- a/source/common/arm/asm.S Thu May 12 14:53:41 2016 +0530 +++ b/source/common/arm/asm.S Thu May 12 12:49:41 2016 -0500 @@ -175,9 +175,20 @@ vtrn.8 \r2, \r3 .endm -.macro TRANSPOSE4x4_16 d0 d1 d2 d3 -vtrn.32 \d0, \d2 -vtrn.32 \d1, \d3 -vtrn.16 \d0, \d1 -vtrn.16 \d2, \d3 +.macro TRANSPOSE4x4_16 r0, r1, r2, r3 +vtrn.32 \r0, \r2// r0 = [21 20 01 00], r2 = [23 22 03 02] +vtrn.32 \r1, \r3// r1 = [31 30 11 10], r3 = [33 32 13 12] +vtrn.16 \r0, \r1// r0 = [30 20 10 00], r1 = [31 21 11 01] +vtrn.16 \r2, \r3// r2 = [32 22 12 02], r3 = [33 23 13 03] .endm + +.macro TRANSPOSE4x4x2_16 rA0, rA1, rA2, rA3, rB0, rB1, rB2, rB3 +vtrn.32 \rA0, \rA2 // r0 = [21 20 01 00], r2 = [23 22 03 02] +vtrn.32 \rA1, \rA3 // r1 = [31 30 11 10], r3 = [33 32 13 12] +vtrn.32 \rB0, \rB2 +vtrn.32 \rB1, \rB3 +vtrn.16 \rA0, \rA1 // r0 = [30 20 10 00], r1 = [31 21 11 01] +vtrn.16 \rA2, \rA3 // r2 = [32 22 12 02], r3 = [33 23 13 03] +vtrn.16 \rB0, \rB1 +vtrn.16 \rB2, \rB3 +.endm ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 2] asm: ARM NEON version of DCT[8x8]
# HG changeset patch # User Min Chen <min.c...@multicorewareinc.com> # Date 1463075384 18000 # Node ID 98fdbf00b01eef722dd804e676c0c81429873cca # Parent f880db0a9a9b352077014aa69571d3169a37a2fc asm: ARM NEON version of DCT[8x8] --- source/common/arm/asm-primitives.cpp |1 + source/common/arm/dct-a.S| 278 ++ source/common/arm/dct8.h |1 + 3 files changed, 280 insertions(+), 0 deletions(-) diff -r f880db0a9a9b -r 98fdbf00b01e source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Thu May 12 12:49:41 2016 -0500 +++ b/source/common/arm/asm-primitives.cpp Thu May 12 12:49:44 2016 -0500 @@ -930,6 +930,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_neon); p.cu[BLOCK_4x4].dct = PFX(dct_4x4_neon); +p.cu[BLOCK_8x8].dct = PFX(dct_8x8_neon); } if (cpuMask & X265_CPU_ARMV6) { diff -r f880db0a9a9b -r 98fdbf00b01e source/common/arm/dct-a.S --- a/source/common/arm/dct-a.S Thu May 12 12:49:41 2016 -0500 +++ b/source/common/arm/dct-a.S Thu May 12 12:49:44 2016 -0500 @@ -120,3 +120,281 @@ bx lr endfunc +.align 4 +ctr4: +.word 83// d0[0] = 83 +.word 36// d0[1] = 36 +ctr8: +.word 75// d1[0] = 75 +.word 89// d1[1] = 89 +.word 18// d2[0] = 18 +.word 50// d2[1] = 50 + + +/* uses registers q4 - q7 for temp values */ +.macro tr4 r0, r1, r2, r3 +vsub.s32q8, \r0, \r3// EO0 +vadd.s32q9, \r0, \r3// EE0 +vadd.s32q10, \r1, \r2 // EE1 +vsub.s32q11, \r1, \r2 // EO1 + +vmul.s32\r1, q8, d0[0] // 83 * EO0 +vmul.s32\r3, q8, d0[1] // 36 * EO0 +vshl.s32q9, q9, #6 // 64 * EE0 +vshl.s32q10, q10, #6// 64 * EE1 +vmla.s32\r1, q11, d0[1] // 83 * EO0 + 36 * EO1 +vmls.s32\r3, q11, d0[0] // 36 * EO0 - 83 * EO1 +vadd.s32\r0, q9, q10// 64 * (EE0 + EE1) +vsub.s32\r2, q9, q10// 64 * (EE0 - EE1) +.endm + + +.macro tr8 r0, r1, r2, r3 +vmul.s32 q12, \r0, d1[1] // 89 * src1 +vmul.s32 q13, \r0, d1[0] // 75 * src1 +vmul.s32 q14, \r0, d2[1] // 50 * src1 +vmul.s32 q15, \r0, d2[0] // 18 * src1 + +vmla.s32 q12, \r1, d1[0] // 75 * src3 +vmls.s32 q13, \r1, d2[0] // 
-18 * src3 +vmls.s32 q14, \r1, d1[1] // -89 * src3 +vmls.s32 q15, \r1, d2[1] // -50 * src3 + +vmla.s32 q12, \r2, d2[1] // 50 * src5 +vmls.s32 q13, \r2, d1[1] // -89 * src5 +vmla.s32 q14, \r2, d2[0] // 18 * src5 +vmla.s32 q15, \r2, d1[0] // 75 * src5 + +vmla.s32 q12, \r3, d2[0] // 18 * src7 +vmls.s32 q13, \r3, d2[1] // -50 * src7 +vmla.s32 q14, \r3, d1[0] // 75 * src7 +vmls.s32 q15, \r3, d1[1] // -89 * src7 +.endm + + +// TODO: in the DCT-2D stage, I spending 4x8=32 LD/ST operators because I haven't temporary buffer +/* void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride) */ +function x265_dct_8x8_neon +vpush {q4-q7} + +mov r2, r2, lsl #1 + +adr r3, ctr4 +vld1.16 {d0-d2}, [r3] +mov r3, r1 + +// DCT-1D +// top half +vld1.16 {q12}, [r0], r2 +vld1.16 {q13}, [r0], r2 +vld1.16 {q14}, [r0], r2 +vld1.16 {q15}, [r0], r2 + +TRANSPOSE4x4x2_16 d24, d26, d28, d30, d25, d27, d29, d31 + +// |--| +// |24| +// |26| +// |28| +// |30| +// |25| +// |27| +// |29| +// |31| +// |--| + +vaddl.s16 q4, d28, d27 +vaddl.s16 q5, d30, d25 +vaddl.s16 q2, d24, d31 +vaddl.s16 q3, d26, d29 + +tr4 q2, q3, q4, q5 + +vqrshrn.s32 d20, q3, 2 +vqrshrn.s32 d16, q2, 2 +vqrshrn.s32 d17, q4, 2 +vqrshrn.s32 d21, q5, 2 + +vsubl.s16 q2, d24, d31 +vsubl.s16 q3, d26, d29 +vsubl.s16 q4, d28, d27 +vsubl.s16 q5, d30, d25 + +tr8 q2, q3, q4, q5 + +vqrshrn.s32 d18, q12, 2 +vqrshrn.s32 d22, q13, 2 +vqrshrn.s32 d19, q14, 2 +vqrshrn.s32 d23, q15, 2 + +vstm r1!, {d16-d23} + +// bottom half +vld1.16 {q12}, [r0], r2 +vld1.16 {q13}, [r0], r2 +vld1.16 {q14}, [r0], r2 +vld1.16 {q15}, [r0], r2 +mov r2, #8*2 + +TRANSPOSE4x4x2_16 d24, d26, d28, d30, d25, d27, d29, d31 + +// |--| +// |24| +// |26| +// |28| +// |30| +// |25| +// |27| +// |29| +// |31| +// |--| + +vaddl.s16 q4, d28, d27 +vaddl.s16 q5, d30, d25 +vaddl.s16 q2, d24, d31 +vaddl.s16 q3, d26, d29 + +tr4 q2, q3, q4, q5 + +vqrshrn.s32 d20, q3, 2 +vqrshrn.s32 d16, q2, 2 +vqrshrn.s32 d17, q4, 2 +vqrshrn.s32 d21, q5, 2 + +vsubl.s16 q2, d24, d31 +vsubl.s16 q3, d26, 
d29 +vsubl.s16 q4, d28, d27 +vsubl.s16 q5, d30, d25 + +tr8 q2, q3, q4, q5 + +vqrshrn.s32 d18, q12, 2 +vqrshrn.s32 d22, q13, 2 +vqrshrn.s32 d19, q14, 2 +vqrshrn.s32 d23, q15, 2 + +vstm r1, {d16-d23} +
[x265] [PATCH] asm: ARM NEON version of DCT[4x4]
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1461651296 -19800 # Node ID 0ca4769256c992c7fcef3d9110cb113e0ce88b56 # Parent 19cced21060f71e8efe5f2544ccb14f9273fd93c asm: ARM NEON version of DCT[4x4] diff -r 19cced21060f -r 0ca4769256c9 source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Tue Apr 26 15:06:55 2016 -0700 +++ b/source/common/CMakeLists.txt Tue Apr 26 11:44:56 2016 +0530 @@ -89,7 +89,7 @@ set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) # add ARM assembly/intrinsic files here -set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S) +set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S) set(VEC_PRIMITIVES) set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") diff -r 19cced21060f -r 0ca4769256c9 source/common/arm/asm-primitives.cpp --- a/source/common/arm/asm-primitives.cpp Tue Apr 26 15:06:55 2016 -0700 +++ b/source/common/arm/asm-primitives.cpp Tue Apr 26 11:44:56 2016 +0530 @@ -34,6 +34,7 @@ #include "pixel.h" #include "pixel-util.h" #include "ipfilter8.h" +#include "dct8.h" } namespace X265_NS { @@ -820,6 +821,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_neon); p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_neon); +p.cu[BLOCK_4x4].dct = PFX(dct_4x4_neon); } if (cpuMask & X265_CPU_ARMV6) { diff -r 19cced21060f -r 0ca4769256c9 source/common/arm/dct-a.S --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/source/common/arm/dct-a.S Tue Apr 26 11:44:56 2016 +0530 @@ -0,0 +1,122 @@ +/***** + * Copyright (C) 2016 x265 project + * + * Authors: Min Chen <chenm...@163.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + */ + +#include "asm.S" + +.section .rodata + +.align 4 + +.text + +.align 4 + +//dst[0 * line] = ((64 * E[0] + 64 * E[1] + add) >> shift); +//dst[2 * line] = ((64 * E[0] - 64 * E[1] + add) >> shift); +//dst[1 * line] = ((83 * O[0] + 36 * O[1] + add) >> shift); +//dst[3 * line] = ((36 * O[0] - 83 * O[1] + add) >> shift); + +/* void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) */ +function x265_dct_4x4_neon +mov r2, r2, lsl #1 +vld1.16 {d0}, [r0, :64], r2 // d0 = [03 02 01 00] +vld1.16 {d1}, [r0, :64], r2 // d1 = [13 12 11 10] +vld1.16 {d2}, [r0, :64], r2 // d2 = [23 22 21 20] +vld1.16 {d3}, [r0, :64] // d3 = [33 32 31 30] + +vtrn.32 q0, q1 // q0 = [31 30 11 10 21 20 01 00], q1 = [33 32 13 12 23 22 03 02] +vrev32.16 q1, q1 // q1 = [32 33 12 13 22 23 02 03] + +movconstr0, 0x00240053 +movconstr2, 0xFFAD0024 + +// DCT-1D +vadd.s16q2, q0, q1 // q2 = [E31 E30 E11 E10 E21 E20 E01 E00] +vsub.s16q3, q0, q1 // q3 = [O31 O30 O11 O10 O21 O20 O01 O00] +vdup.32 d16, r0 // d16 = [ 36 83] +vdup.32 d17, r2 // d17 = [-83 36] +vtrn.16 d4, d5 // d4 = [E30 E20 E10 E00], d5 = [E31 E21 E11 E01] +vtrn.32 d6, d7 // q3 = [O31 O30 O21 O20 O11 O10 O01 O00] + +vmull.s16 q9, d6, d16 +vmull.s16 q10, d7, d16// [q9, q10] = [ 36*O1 83*O0] -> [1] +vmull.s16 q11, d6, d17 +vmull.s16 q12, d7, d17// [q11,q12] = [-83*O1 36*O0] ->
[x265] [PATCH 3 of 3] asm: rewrite interpolate hps width of [32, 48, 64], improve ~20%
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1460482251 18000 # Node ID f74e220607e15ea4c00645e59055996767303aaa # Parent 37e80d50caf51a74e85c83f24317935171a5d375 asm: rewrite interpolate hps width of [32,48,64], improve ~20% OLD: luma_hps[32x32] 6.32x16429.69103771.02 luma_hps[32x16] 6.04x10121.5661140.21 luma_hps[32x64] 6.47x30813.70199438.95 luma_hps[32x24] 6.23x13277.2682747.75 luma_hps[48x64] 6.13x46002.25282176.44 luma_hps[64x64] 6.15x61393.88377670.03 luma_hps[64x32] 6.79x33001.77224096.58 luma_hps[64x48] 6.21x47242.66293529.16 luma_hps[64x16] 6.51x19207.61125016.56 NEW: luma_hps[32x32] 7.66x13404.22102730.96 luma_hps[32x16] 7.32x8355.57 61133.25 luma_hps[32x64] 7.68x24496.17188086.11 luma_hps[32x24] 8.00x10879.0987077.93 luma_hps[48x64] 7.62x37094.37282758.94 luma_hps[64x64] 7.82x48535.86379390.78 luma_hps[64x32] 7.91x26512.17209755.50 luma_hps[64x48] 8.06x37020.63298498.28 luma_hps[64x16] 7.95x15479.03123132.41 --- source/common/x86/ipfilter16.asm | 100 +++--- 1 files changed, 50 insertions(+), 50 deletions(-) diff -r 37e80d50caf5 -r f74e220607e1 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Apr 12 12:30:48 2016 -0500 +++ b/source/common/x86/ipfilter16.asm Tue Apr 12 12:30:51 2016 -0500 @@ -116,6 +116,7 @@ dw -1, 4, -11, 40, 40, -11, 4, -1 dw 0, 1, -5, 17, 58, -10, 4, -1 +ALIGN 32 tab_LumaCoeffV: times 4 dw 0, 0 times 4 dw 0, 64 times 4 dw 0, 0 @@ -161,9 +162,8 @@ const interp8_hpp_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 -const pb_shuf, db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 -db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 - +const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 +db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 SECTION .text cextern pd_8 @@ -10407,7 +10407,7 @@ vpbroadcastqm0, [tab_LumaCoeff + r4] vpbroadcastqm1, [tab_LumaCoeff + r4 + 8] %endif -movam3, [pb_shuf] +movam3, 
[interp8_hpp_shuf] vbroadcasti128 m2, [INTERP_OFFSET_PS] ; register map @@ -10475,7 +10475,7 @@ vpbroadcastqm0, [tab_LumaCoeff + r4] vpbroadcastqm1, [tab_LumaCoeff + r4 + 8] %endif -movam3, [pb_shuf] +movam3, [interp8_hpp_shuf] vbroadcasti128 m2, [INTERP_OFFSET_PS] ; register map @@ -10536,16 +10536,16 @@ add r3d, r3d mov r4d, r4m mov r5d, r5m -shl r4d, 4 -%ifdef PIC -lea r6, [tab_LumaCoeff] -vpbroadcastqm0, [r6 + r4] -vpbroadcastqm1, [r6 + r4 + 8] -%else -vpbroadcastqm0, [tab_LumaCoeff + r4] -vpbroadcastqm1, [tab_LumaCoeff + r4 + 8] -%endif -movam3, [pb_shuf] +shl r4d, 6 +%ifdef PIC +lea r6, [tab_LumaCoeffV] +movum0, [r6 + r4] +movum1, [r6 + r4 + mmsize] +%else +movum0, [tab_LumaCoeffV + r4] +movum1, [tab_LumaCoeffV + r4 + mmsize] +%endif +movam3, [interp8_hpp_shuf_new] vbroadcasti128 m2, [INTERP_OFFSET_PS] ; register map @@ -10554,7 +10554,7 @@ sub r0, 6 testr5d, r5d mov r4d, %2 -jz .loop0 +jz .loop0 lea r6, [r1*3] sub r0, r6 add r4d, 7 @@ -10563,64 +10563,64 @@ %assign x 0 %rep %1/16 vbroadcasti128 m4, [r0 + x] -vbroadcasti128 m5, [r0 + 8 + x] +vbroadcasti128 m5, [r0 + 4 * SIZEOF_PIXEL + x] pshufb m4, m3 -pshufb m7, m5, m3 +pshufb m5, m3 pmaddwd m4, m0 -pmaddwd m7, m1 +pmaddwd m7, m5, m1 paddd m4, m7 +vextracti128xm7, m4, 1 +paddd xm4, xm7 +paddd xm4, xm2 +psrad xm4, INTERP_SHIFT_PS vbroadcasti128 m6, [r0 + 16 + x] +pshufb m6, m3 + +pmaddwd m5, m0 +pmaddwd m7, m6, m1 +paddd m5, m7 +vextra
[x265] [PATCH 2 of 3] asm: AVX2 version of sa8d[32x32]
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1460482248 18000 # Node ID 37e80d50caf51a74e85c83f24317935171a5d375 # Parent 40afead3177d7c128066334bfe075042388e86b0 asm: AVX2 version of sa8d[32x32] AVX: sa8d[32x32] 5.47x7403.68 40490.18 AVX2: sa8d[32x32] 10.57x 3783.80 40001.89 --- source/common/x86/asm-primitives.cpp |1 + source/common/x86/pixel-a.asm| 369 ++ 2 files changed, 370 insertions(+), 0 deletions(-) diff -r 40afead3177d -r 37e80d50caf5 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Sat Apr 09 19:32:28 2016 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Apr 12 12:30:48 2016 -0500 @@ -2161,6 +2161,7 @@ p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2); p.cu[LUMA_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2); p.cu[LUMA_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2); +p.cu[LUMA_32x32].sa8d = PFX(pixel_sa8d_32x32_avx2); #endif if (cpuMask & X265_CPU_BMI2) diff -r 40afead3177d -r 37e80d50caf5 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Sat Apr 09 19:32:28 2016 +0530 +++ b/source/common/x86/pixel-a.asm Tue Apr 12 12:30:48 2016 -0500 @@ -13995,4 +13995,373 @@ shr eax, 1 RET + +; TODO: optimize me, need more 2 of YMM registers because C model get partial result every 16x16 block +INIT_YMM avx2 +cglobal pixel_sa8d_32x32, 4,8,14 +FIX_STRIDES r1, r3 +lea r4, [3*r1] +lea r5, [3*r3] +lea r6, [r0+4*r1] +lea r7, [r2+4*r3] +vbroadcasti128 m7, [pw_1] + + +;SA8D[16x8] ; pix[0] +;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +movu m0, [r0] +movu m5, [r2] +psubw m0, m5 +movu m1, [r0 + r1] +movu m6, [r2 + r3] +psubw m1, m6 +movu m2, [r0 + r1 * 2] +movu m5, [r2 + r3 * 2] +psubw m2, m5 +movu m8, [r0 + r4] +movu m6, [r2 + r5] +psubw m8, m6 + +;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 +movu m4, [r6] +movu m11, [r7] +psubw m4, m11 +movu m5, [r6 + r1] +movu m6, [r7 + r3] +psubw m5, m6 +movu m3, [r6 + r1 * 2] +movu m11, [r7 + r3 * 2] +psubw m3, m11 +movu m9, [r6 + r4] +movu m6, [r7 + r5] +psubw m9, m6 + +HADAMARD8_2D 0, 
1, 2, 8, 4, 5, 3, 9, 6, amax +paddw m0, m1 +paddw m2, m8 +pmaddwd m0, m7 +pmaddwd m2, m7 +paddd m10, m0, m2 + + +; SA8D[16x8] ; pix[16] +add r0, mmsize +add r2, mmsize +add r6, mmsize +add r7, mmsize + +;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +movu m0, [r0] +movu m5, [r2] +psubw m0, m5 +movu m1, [r0 + r1] +movu m6, [r2 + r3] +psubw m1, m6 +movu m2, [r0 + r1 * 2] +movu m5, [r2 + r3 * 2] +psubw m2, m5 +movu m8, [r0 + r4] +movu m6, [r2 + r5] +psubw m8, m6 + +;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 +movu m4, [r6] +movu m11, [r7] +psubw m4, m11 +movu m5, [r6 + r1] +movu m6, [r7 + r3] +psubw m5, m6 +movu m3, [r6 + r1 * 2] +movu m11, [r7 + r3 * 2] +psubw m3, m11 +movu m9, [r6 + r4] +movu m6, [r7 + r5] +psubw m9, m6 + +HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +paddw m0, m1 +paddw m2, m8 +pmaddwd m0, m7 +pmaddwd m2, m7 +paddd m12, m0, m2 + + +; SA8D[16x8] ; pix[8*stride+16] +lea r0, [r0+8*r1] +lea r2, [r2+8*r3] +lea r6, [r6+8*r1] +lea r7, [r7+8*r3] + +;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +movu m0, [r0] +movu m5, [r2] +psubw m0, m5 +movu m1, [r0 + r1] +movu m6, [r2 + r3] +psubw m1, m6 +movu m2, [r0 + r1 * 2] +movu m5, [r2 + r3 * 2] +psubw m2, m5 +movu m8, [r0 + r4] +movu m6, [r2 + r5] +psubw m8, m6 + +;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 +movu m4, [r6] +movu m11, [r7] +psubw m4, m11 +movu m5, [r6 + r1] +movu m6, [r7 + r3] +psubw m5, m6 +movu m3, [r6 + r1 * 2] +movu m11, [r7 + r3 * 2] +psubw m3, m11 +movu m9, [r6 + r4] +movu m6, [r7 + r5] +psubw m9, m6 + +HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +paddw m0, m1 +paddw m2, m8 +pmaddwd m0, m7 +pmaddwd m2, m7 +paddd m12, m0 +paddd m12, m2 + +; sum[1] +HADDD m12, m0 + + +; SA8D[16x8] ; pix[8*stride] +sub r0, mmsize +sub r2, mmsize +sub r6, mmsize +sub r7, mmsize + +;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +movu m0, [r0] +movu m5, [r2] +psubw m0, m5 +movu m1, [r0 + r1] +movu m6, [r2 + r3] +psubw m1, m6 +movu m2, [r0 + r1 * 2] +movu m5, [r2 + r3 * 2] +psubw m2, m5 +movu m8, [r0 + r4] +movu 
m6, [r2 + r5] +psubw m8, m6 + +;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 +movu m4, [r6] +movu m11, [r7] +psubw m4, m11 +movu m5, [r6 + r1] +movu m6, [r7 + r3] +psubw m5, m6 +movu m3, [r6 + r1 * 2] +movu m11, [r7 + r3 * 2] +psu
[x265] [PATCH 1 of 3] doc: update tune grain documentation
# HG changeset patch # User Deepthi Nandakumar # Date 1460210548 -19800 # Node ID 40afead3177d7c128066334bfe075042388e86b0 # Parent 31a417fa69ce37a76e41c203a017686fe7f73877 doc: update tune grain documentation diff -r 31a417fa69ce -r 40afead3177d doc/reST/presets.rst --- a/doc/reST/presets.rst Sat Apr 09 19:06:56 2016 +0530 +++ b/doc/reST/presets.rst Sat Apr 09 19:32:28 2016 +0530 @@ -121,29 +121,22 @@ :option:`--tune` *grain* tries to improve the retention of film grain in -the reconstructed output. It disables rate distortion optimizations in -quantization, and increases the default psy-rd. +the reconstructed output. Varying the quantization parameter within and +across frames causes grain strobing (uneven distribution of grain), which +is visually distracting. :option:`--tune` *grain* severely dials down +algorithms that vary the quantization parameter. -* :option:`--psy-rd` 0.5 -* :option:`--rdoq-level` 0 -* :option:`--psy-rdoq` 0 +* :option:`--aq-mode` 0 +* :option:`--cu-tree` 0 +* :option:`--ip-factor` 1.1 + * :option:`--pb-factor` 1.0 + * :option:`--qp-step` 1 -It lowers the strength of adaptive quantization, so residual energy can -be more evenly distributed across the (noisy) picture: - -* :option:`--aq-strength` 0.3 - -And it similarly tunes rate control to prevent the slice QP from -swinging too wildly from frame to frame: - -* :option:`--ipratio` 1.1 -* :option:`--pbratio` 1.1 -* :option:`--qcomp` 0.8 - -And lastly it reduces the strength of deblocking to prevent grain being -blurred on block boundaries: - -* :option:`--deblock` -2 +It also enables a specialised ratecontrol algorithm :option:`--rc-grain` +that strictly minimises QP fluctuations across frames, while still allowing +the encoder to hit bitrate targets and VBV buffer limits (with a slightly +higher margin of error than normal). It is highly recommended that this +algorithm is used only through the :option:`--tune` *grain* feature. 
Fast Decode ~~~ ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] fix threading conflict in low resolution video (Issue #260)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1460071560 18000 # Node ID 1253c2bdb4030bbb3d84422b6068b5f4942571e9 # Parent 5b01678f6fb4e89e23cd41295592a9aa5d51d4ba fix threading conflict in low resolution video (Issue #260) The threading conflict occurs because the video resolution is too low, which makes the worker threads finish at the same time. The root cause is in our sync logic: we release all of the filter sync-locks once the last column is processed. More details are given below. Time 0: Row0 - requests a worker thread Row1 - requests a worker thread Time 1: Row0 - thread assignment fails, so processing continues in FrameFilter::processRow(). Row1 - gets a thread (since all CUs of the current row have finished encoding, allowCol is set to the last column, which means the sync logic imposes no restriction) Time 2: Row1 - the thread processes beyond the Row0 bound --> Crash here diff -r 5b01678f6fb4 -r 1253c2bdb403 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cpp Sat Apr 02 19:08:49 2016 +0100 +++ b/source/encoder/framefilter.cpp Thu Apr 07 18:26:00 2016 -0500 @@ -320,11 +320,14 @@ const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap; PicYuv* reconPic = m_encData->m_reconPic; const int colStart = m_lastCol.get(); +const int numCols = m_frameFilter->m_numCols; // TODO: Waiting previous row finish or simple clip on it? -const int colEnd = m_allowedCol.get(); -const int numCols = m_frameFilter->m_numCols; +int colEnd = m_allowedCol.get(); // Avoid threading conflict +if (m_prevRow && colEnd > m_prevRow->m_lastDeblocked.get()) +colEnd = m_prevRow->m_lastDeblocked.get(); + if (colStart >= colEnd) return; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 2 updated] asm: new AVX2 version of satd_8x8 (509c -> 307c)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1459377415 18000 # Node ID ed0ed12c8359f77acdd85f7443bd4cd7bc1ba16e # Parent 5dbd6a0c8e17481a0c4d31243ebc8b46ad59e15d asm: new AVX2 version of satd_8x8 (509c -> 307c) --- source/common/x86/asm-primitives.cpp |4 ++ source/common/x86/pixel-a.asm| 74 ++ 2 files changed, 78 insertions(+), 0 deletions(-) diff -r 5dbd6a0c8e17 -r ed0ed12c8359 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Mar 28 12:53:40 2016 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Mar 30 17:36:55 2016 -0500 @@ -2157,6 +2157,10 @@ ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); // calling luma_hvpp for all sizes p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu; // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] +#if X265_DEPTH == 10 +p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2); +#endif + if (cpuMask & X265_CPU_BMI2) { p.scanPosLast = PFX(scanPosLast_avx2_bmi2); diff -r 5dbd6a0c8e17 -r ed0ed12c8359 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Mar 28 12:53:40 2016 +0530 +++ b/source/common/x86/pixel-a.asm Wed Mar 30 17:36:55 2016 -0500 @@ -13799,3 +13799,77 @@ movzx eax, al RET %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 + + +%if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10 +%macro LOAD_DIFF_AVX2 4 +movu %1, %3 +movu %2, %4 +psubw %1, %2 +%endmacro + +%macro LOAD_DIFF_8x4P_AVX2 7-9 r0,r2 ; 4x dest, 2x temp, 2x pointer +LOAD_DIFF_AVX2 xm%1, xm%5, [%8], [%9] +LOAD_DIFF_AVX2 xm%2, xm%6, [%8+r1], [%9+r3] +LOAD_DIFF_AVX2 xm%3, xm%5, [%8+2*r1], [%9+2*r3] +LOAD_DIFF_AVX2 xm%4, xm%6, [%8+r4], [%9+r5] + +lea %8, [%8+4*r1] +lea %9, [%9+4*r3] +%endmacro + +%macro SATD_8x4_AVX2 8-9 +HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax +paddw m%8, m%2 +paddw m%8, m%4 +%endmacro + +INIT_YMM avx2 +cglobal pixel_satd_8x8, 4,4,7 + +FIX_STRIDES r1, r3 +pxorxm6, xm6 + +; load_diff 0 & 4 +movuxm0, [r0] +movuxm1, [r2] +vinserti128 m0, m0, [r0 + r1 * 4], 1 +vinserti128 
m1, m1, [r2 + r3 * 4], 1 +psubw m0, m1 +add r0, r1 +add r2, r3 + +; load_diff 1 & 5 +movuxm1, [r0] +movuxm2, [r2] +vinserti128 m1, m1, [r0 + r1 * 4], 1 +vinserti128 m2, m2, [r2 + r3 * 4], 1 +psubw m1, m2 +add r0, r1 +add r2, r3 + +; load_diff 2 & 6 +movuxm2, [r0] +movuxm3, [r2] +vinserti128 m2, m2, [r0 + r1 * 4], 1 +vinserti128 m3, m3, [r2 + r3 * 4], 1 +psubw m2, m3 +add r0, r1 +add r2, r3 + +; load_diff 3 & 7 +movuxm3, [r0] +movuxm4, [r2] +vinserti128 m3, m3, [r0 + r1 * 4], 1 +vinserti128 m4, m4, [r2 + r3 * 4], 1 +psubw m3, m4 + +SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 + +vextracti128 xm0, m6, 1 +paddw xm6, xm0 +HADDUW xm6, xm0 +movd eax, xm6 +RET + +%endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 2 updated] asm: new AVX2 version of sa8d[8x8, 16x16]
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1459377417 18000 # Node ID 9f841737a0f755f4d13e1db4e5b19cbebd05130a # Parent ed0ed12c8359f77acdd85f7443bd4cd7bc1ba16e asm: new AVX2 version of sa8d[8x8, 16x16] AVX: sa8d[8x8]4.82x517.79 2493.20 sa8d[16x16] 5.65x 1952.40 11039.93 AVX2: sa8d[8x8]5.13x489.15 2507.44 sa8d[16x16] 10.27x 1006.08 11206.09 --- source/common/x86/asm-primitives.cpp |2 + source/common/x86/pixel-a.asm| 149 +++--- 2 files changed, 138 insertions(+), 13 deletions(-) diff -r ed0ed12c8359 -r 9f841737a0f7 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Mar 30 17:36:55 2016 -0500 +++ b/source/common/x86/asm-primitives.cpp Wed Mar 30 17:36:57 2016 -0500 @@ -2159,6 +2159,8 @@ #if X265_DEPTH == 10 p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2); +p.cu[LUMA_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2); +p.cu[LUMA_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2); #endif if (cpuMask & X265_CPU_BMI2) diff -r ed0ed12c8359 -r 9f841737a0f7 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Wed Mar 30 17:36:55 2016 -0500 +++ b/source/common/x86/pixel-a.asm Wed Mar 30 17:36:57 2016 -0500 @@ -13808,20 +13808,14 @@ psubw %1, %2 %endmacro -%macro LOAD_DIFF_8x4P_AVX2 7-9 r0,r2 ; 4x dest, 2x temp, 2x pointer -LOAD_DIFF_AVX2 xm%1, xm%5, [%8], [%9] -LOAD_DIFF_AVX2 xm%2, xm%6, [%8+r1], [%9+r3] -LOAD_DIFF_AVX2 xm%3, xm%5, [%8+2*r1], [%9+2*r3] -LOAD_DIFF_AVX2 xm%4, xm%6, [%8+r4], [%9+r5] - -lea %8, [%8+4*r1] -lea %9, [%9+4*r3] -%endmacro - -%macro SATD_8x4_AVX2 8-9 -HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax -paddw m%8, m%2 -paddw m%8, m%4 +%macro LOAD_DIFF_8x4P_AVX2 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer +LOAD_DIFF_AVX2 xm%1, xm%5, [%7], [%8] +LOAD_DIFF_AVX2 xm%2, xm%6, [%7+r1], [%8+r3] +LOAD_DIFF_AVX2 xm%3, xm%5, [%7+2*r1], [%8+2*r3] +LOAD_DIFF_AVX2 xm%4, xm%6, [%7+r4], [%8+r5] + +;lea %7, [%7+4*r1] +;lea %8, [%8+4*r3] %endmacro INIT_YMM avx2 @@ -13872,4 +13866,133 @@ movd eax, xm6 RET +INIT_XMM avx2 +cglobal 
pixel_sa8d_8x8_internal +lea r6, [r0+4*r1] +lea r7, [r2+4*r3] +LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 + +HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +;HADAMARD2_2D 0, 1, 2, 8, 6, wd +;HADAMARD2_2D 4, 5, 3, 9, 6, wd +;HADAMARD2_2D 0, 2, 1, 8, 6, dq +;HADAMARD2_2D 4, 3, 5, 9, 6, dq +;HADAMARD2_2D 0, 4, 2, 3, 6, qdq, amax +;HADAMARD2_2D 1, 5, 8, 9, 6, qdq, amax + +paddw m0, m1 +paddw m0, m2 +paddw m0, m8 +SAVE_MM_PERMUTATION +ret + + +INIT_XMM avx2 +cglobal pixel_sa8d_8x8, 4,8,12 +FIX_STRIDES r1, r3 +lea r4, [3*r1] +lea r5, [3*r3] +call pixel_sa8d_8x8_internal +HADDUW m0, m1 +movd eax, m0 +add eax, 1 +shr eax, 1 +RET + + +INIT_YMM avx2 +cglobal pixel_sa8d_16x16, 4,8,12 +FIX_STRIDES r1, r3 +lea r4, [3*r1] +lea r5, [3*r3] +lea r6, [r0+4*r1] +lea r7, [r2+4*r3] +vbroadcasti128 m7, [pw_1] + +;call pixel_sa8d_8x8_internal ; pix[0] +;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +movu m0, [r0] +movu m5, [r2] +psubw m0, m5 +movu m1, [r0 + r1] +movu m6, [r2 + r3] +psubw m1, m6 +movu m2, [r0 + r1 * 2] +movu m5, [r2 + r3 * 2] +psubw m2, m5 +movu m8, [r0 + r4] +movu m6, [r2 + r5] +psubw m8, m6 + +;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 +movu m4, [r6] +movu m11, [r7] +psubw m4, m11 +movu m5, [r6 + r1] +movu m6, [r7 + r3] +psubw m5, m6 +movu m3, [r6 + r1 * 2] +movu m11, [r7 + r3 * 2] +psubw m3, m11 +movu m9, [r6 + r4] +movu m6, [r7 + r5] +psubw m9, m6 + +HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +paddw m0, m1 +paddw m2, m8 +pmaddwd m0, m7 +pmaddwd m2, m7 +paddd m10, m0, m2 + +lea r0, [r0+8*r1] +lea r2, [r2+8*r3] +lea r6, [r6+8*r1] +lea r7, [r7+8*r3] + +;call pixel_sa8d_8x8_internal ; pix[8*stride+8] +;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +movu m0, [r0] +movu m5, [r2] +psubw m0, m5 +movu m1, [r0 + r1] +movu m6, [r2 + r3] +psubw m1, m6 +movu m2, [r0 + r1 * 2] +movu m5, [r2 + r3 * 2] +psubw m2, m5 +movu m8, [r0 + r4] +movu m6, [r2 + r5] +psubw m8, m6 + +;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 +movu m4, [r6] 
+movu m11, [r7] +psubw m4, m11 +movu m5, [r6 + r1] +movu m6, [r7 + r3] +psubw m5, m6 +movu m3, [r6 + r1 * 2] +movu m11, [r7 + r3 * 2] +psubw m3, m11 +movu m9, [r6 + r4] +movu m6, [r7 + r5] +psubw m9, m6 + +HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +paddw m0, m1 +paddw m2, m8 +pmaddwd m0, m7 +pmaddwd m2, m7
[x265] [PATCH 2 of 2] asm: new AVX2 version of sa8d[8x8, 16x16]
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1459209477 18000 # Node ID 08463d2a7699df3e33c9a212650fa76e8c9e962e # Parent b6c356991e265118bba3ce7581f301d84eece600 asm: new AVX2 version of sa8d[8x8, 16x16] AVX: sa8d[8x8]4.82x517.79 2493.20 sa8d[16x16] 5.65x 1952.40 11039.93 AVX2: sa8d[8x8]5.13x489.15 2507.44 sa8d[16x16] 10.27x 1006.08 11206.09 diff -r b6c356991e26 -r 08463d2a7699 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Mar 28 18:28:20 2016 -0500 +++ b/source/common/x86/asm-primitives.cpp Mon Mar 28 18:57:57 2016 -0500 @@ -2159,6 +2159,8 @@ #if X265_DEPTH == 10 p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2); +p.cu[LUMA_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2); +p.cu[LUMA_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2); #endif if (cpuMask & X265_CPU_BMI2) diff -r b6c356991e26 -r 08463d2a7699 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Mar 28 18:28:20 2016 -0500 +++ b/source/common/x86/pixel-a.asm Mon Mar 28 18:57:57 2016 -0500 @@ -13808,20 +13808,14 @@ psubw %1, %2 %endmacro -%macro LOAD_DIFF_8x4P_AVX2 7-9 r0,r2 ; 4x dest, 2x temp, 2x pointer -LOAD_DIFF_AVX2 xm%1, xm%5, [%8], [%9] -LOAD_DIFF_AVX2 xm%2, xm%6, [%8+r1], [%9+r3] -LOAD_DIFF_AVX2 xm%3, xm%5, [%8+2*r1], [%9+2*r3] -LOAD_DIFF_AVX2 xm%4, xm%6, [%8+r4], [%9+r5] - -lea %8, [%8+4*r1] -lea %9, [%9+4*r3] -%endmacro - -%macro SATD_8x4_AVX2 8-9 -HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax -paddw m%8, m%2 -paddw m%8, m%4 +%macro LOAD_DIFF_8x4P_AVX2 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer +LOAD_DIFF_AVX2 xm%1, xm%5, [%7], [%8] +LOAD_DIFF_AVX2 xm%2, xm%6, [%7+r1], [%8+r3] +LOAD_DIFF_AVX2 xm%3, xm%5, [%7+2*r1], [%8+2*r3] +LOAD_DIFF_AVX2 xm%4, xm%6, [%7+r4], [%8+r5] + +;lea %7, [%7+4*r1] +;lea %8, [%8+4*r3] %endmacro INIT_YMM avx2 @@ -13872,4 +13866,133 @@ movd eax, xm6 RET +INIT_XMM avx2 +cglobal pixel_sa8d_8x8_internal +lea r6, [r0+4*r1] +lea r7, [r2+4*r3] +LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, 
r7 + +HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +;HADAMARD2_2D 0, 1, 2, 8, 6, wd +;HADAMARD2_2D 4, 5, 3, 9, 6, wd +;HADAMARD2_2D 0, 2, 1, 8, 6, dq +;HADAMARD2_2D 4, 3, 5, 9, 6, dq +;HADAMARD2_2D 0, 4, 2, 3, 6, qdq, amax +;HADAMARD2_2D 1, 5, 8, 9, 6, qdq, amax + +paddw m0, m1 +paddw m0, m2 +paddw m0, m8 +SAVE_MM_PERMUTATION +ret + + +INIT_XMM avx2 +cglobal pixel_sa8d_8x8, 4,8,12 +FIX_STRIDES r1, r3 +lea r4, [3*r1] +lea r5, [3*r3] +call pixel_sa8d_8x8_internal +HADDUW m0, m1 +movd eax, m0 +add eax, 1 +shr eax, 1 +RET + + +INIT_YMM avx2 +cglobal pixel_sa8d_16x16, 4,8,12 +FIX_STRIDES r1, r3 +lea r4, [3*r1] +lea r5, [3*r3] +lea r6, [r0+4*r1] +lea r7, [r2+4*r3] +vbroadcasti128 m7, [pw_1] + +;call pixel_sa8d_8x8_internal ; pix[0] +;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +movu m0, [r0] +movu m5, [r2] +psubw m0, m5 +movu m1, [r0 + r1] +movu m6, [r2 + r3] +psubw m1, m6 +movu m2, [r0 + r1 * 2] +movu m5, [r2 + r3 * 2] +psubw m2, m5 +movu m8, [r0 + r4] +movu m6, [r2 + r5] +psubw m8, m6 + +;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 +movu m4, [r6] +movu m11, [r7] +psubw m4, m11 +movu m5, [r6 + r1] +movu m6, [r7 + r3] +psubw m5, m6 +movu m3, [r6 + r1 * 2] +movu m11, [r7 + r3 * 2] +psubw m3, m11 +movu m9, [r6 + r4] +movu m6, [r7 + r5] +psubw m9, m6 + +HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +paddw m0, m1 +paddw m2, m8 +pmaddwd m0, m7 +pmaddwd m2, m7 +paddd m10, m0, m2 + +lea r0, [r0+8*r1] +lea r2, [r2+8*r3] +lea r6, [r6+8*r1] +lea r7, [r7+8*r1] + +;call pixel_sa8d_8x8_internal ; pix[8*stride+8] +;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2 +movu m0, [r0] +movu m5, [r2] +psubw m0, m5 +movu m1, [r0 + r1] +movu m6, [r2 + r3] +psubw m1, m6 +movu m2, [r0 + r1 * 2] +movu m5, [r2 + r3 * 2] +psubw m2, m5 +movu m8, [r0 + r4] +movu m6, [r2 + r5] +psubw m8, m6 + +;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7 +movu m4, [r6] +movu m11, [r7] +psubw m4, m11 +movu m5, [r6 + r1] +movu m6, [r7 + r3] +psubw m5, m6 +movu m3, [r6 + r1 * 2] +movu m11, [r7 + r3 * 2] +psubw m3, m11 
+movu m9, [r6 + r4] +movu m6, [r7 + r5] +psubw m9, m6 + +HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax +paddw m0, m1 +paddw m2, m8 +pmaddwd m0, m7 +pmaddwd m2, m7 +paddd m10, m0 +paddd m10, m2 + +HADDD m10, m0 + +movd eax, xm10 +add eax, 1 +shr eax, 1 +RET + %endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
[x265] [PATCH 1 of 2] asm: new AVX2 version of satd_8x8 (509c -> 307c)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1459207700 18000 # Node ID b6c356991e265118bba3ce7581f301d84eece600 # Parent 5dbd6a0c8e17481a0c4d31243ebc8b46ad59e15d asm: new AVX2 version of satd_8x8 (509c -> 307c) --- source/common/x86/asm-primitives.cpp |4 ++ source/common/x86/pixel-a.asm| 74 ++ 2 files changed, 78 insertions(+), 0 deletions(-) diff -r 5dbd6a0c8e17 -r b6c356991e26 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Mar 28 12:53:40 2016 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Mar 28 18:28:20 2016 -0500 @@ -2157,6 +2157,10 @@ ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); // calling luma_hvpp for all sizes p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu; // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] +#if X265_DEPTH == 10 +p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2); +#endif + if (cpuMask & X265_CPU_BMI2) { p.scanPosLast = PFX(scanPosLast_avx2_bmi2); diff -r 5dbd6a0c8e17 -r b6c356991e26 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Mar 28 12:53:40 2016 +0530 +++ b/source/common/x86/pixel-a.asm Mon Mar 28 18:28:20 2016 -0500 @@ -13799,3 +13799,77 @@ movzx eax, al RET %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0 + + +%if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10 +%macro LOAD_DIFF_AVX2 4 +movu %1, %3 +movu %2, %4 +psubw %1, %2 +%endmacro + +%macro LOAD_DIFF_8x4P_AVX2 7-9 r0,r2 ; 4x dest, 2x temp, 2x pointer +LOAD_DIFF_AVX2 xm%1, xm%5, [%8], [%9] +LOAD_DIFF_AVX2 xm%2, xm%6, [%8+r1], [%9+r3] +LOAD_DIFF_AVX2 xm%3, xm%5, [%8+2*r1], [%9+2*r3] +LOAD_DIFF_AVX2 xm%4, xm%6, [%8+r4], [%9+r5] + +lea %8, [%8+4*r1] +lea %9, [%9+4*r3] +%endmacro + +%macro SATD_8x4_AVX2 8-9 +HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax +paddw m%8, m%2 +paddw m%8, m%4 +%endmacro + +INIT_YMM avx2 +cglobal pixel_satd_8x8, 4,4,7 + +FIX_STRIDES r1, r3 +pxorxm6, xm6 + +; load_diff 0 & 4 +movuxm0, [r0] +movuxm1, [r2] +vinserti128 m0, m0, [r0 + r1 * 4], 1 +vinserti128 
m1, m1, [r2 + r3 * 4], 1 +psubw m0, m1 +add r0, r1 +add r2, r3 + +; load_diff 1 & 5 +movuxm1, [r0] +movuxm2, [r2] +vinserti128 m1, m1, [r0 + r1 * 4], 1 +vinserti128 m2, m2, [r2 + r3 * 4], 1 +psubw m1, m2 +add r0, r1 +add r2, r3 + +; load_diff 2 & 6 +movuxm2, [r0] +movuxm3, [r2] +vinserti128 m2, m2, [r0 + r1 * 4], 1 +vinserti128 m3, m3, [r2 + r3 * 4], 1 +psubw m2, m3 +add r0, r1 +add r2, r3 + +; load_diff 3 & 7 +movuxm3, [r0] +movuxm4, [r2] +vinserti128 m3, m3, [r0 + r1 * 4], 1 +vinserti128 m4, m4, [r2 + r3 * 4], 1 +psubw m3, m4 + +SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 + +vextracti128 xm0, m6, 1 +paddw xm6, xm0 +HADDUW xm6, xm0 +movd eax, xm6 +RET + +%endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: high_bit_depth sse4 version of saoCuStatsE2 & saoCuStatsE3
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1455859663 -28800 # Node ID 5cdbd129c0d840669758a11597a52aa53f0fcbfa # Parent c2228fb8151ddce111a75fb1c02b25eca5a68604 asm: high_bit_depth sse4 version of saoCuStatsE2 & saoCuStatsE3 diff -r c2228fb8151d -r 5cdbd129c0d8 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Feb 19 09:50:42 2016 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Feb 19 13:27:43 2016 +0800 @@ -1169,6 +1169,8 @@ #if X86_64 p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4); p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4); +p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4); +p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4); #endif } if (cpuMask & X265_CPU_AVX) diff -r c2228fb8151d -r 5cdbd129c0d8 source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Fri Feb 19 09:50:42 2016 +0530 +++ b/source/common/x86/loopfilter.asm Fri Feb 19 13:27:43 2016 +0800 @@ -2872,6 +2872,129 @@ ;} %if ARCH_X86_64 + +%if HIGH_BIT_DEPTH == 1 +INIT_XMM sse4 +cglobal saoCuStatsE2, 5,9,7,0-32; Stack: 5 of stats and 5 of count +mov r5d, r5m +FIX_STRIDES r2d + +; clear internal temporary buffer +pxorm0, m0 +mova[rsp], m0 +mova[rsp + mmsize], m0 +movam5, [pw_1] +movam6, [pb_2] + +.loopH: +; TODO: merge into SIMD in below +; get upBuffX[0] +mov r6w, [r1 + r2] +sub r6w, [r1 - 1 * SIZEOF_PIXEL] +setar6b +setbr7b +sub r6b, r7b +mov [r4], r6b + +; backup unavailable pixels +movhm0, [r4 + r5 + 1] + +mov r6d, r5d +.loopW: +; signDown +; stats[edgeType] +; edgeType +movum1, [r1] +movum2, [r1 + r2 + 1 * SIZEOF_PIXEL] +pcmpgtw m3, m1, m2 +pcmpgtw m2, m1 +pandm2, m5 +por m3, m2 + +movum1, [r1 + mmsize] +movum2, [r1 + r2 + 1 * SIZEOF_PIXEL + mmsize] +pcmpgtw m4, m1, m2 +pcmpgtw m2, m1 +pandm2, m5 +por m4, m2 +packsswbm3, m4 + +movum4, [r3] +paddb m4, m6 +psubb m4, m3 + +; update upBuff1 +movu[r4 + 1], m3 + +; 16 pixels +%assign x 0 +%rep 16 +pextrb r7d, m4, x +incword [rsp + r7 * 2] + +movsx r8d, word [r0 + x * 2] +add [rsp + 5 * 2 + r7 * 4], r8d + +dec 
r6d +jz .next +%assign x x+1 +%endrep + +add r0, mmsize * 2 +add r1, mmsize * SIZEOF_PIXEL +add r3, mmsize +add r4, mmsize +jmp.loopW + +.next: +xchgr3, r4 + +; restore pointer upBuff1 +mov r6d, r5d +and r6d, ~15 +neg r6 ; MUST BE 64-bits, it is Negtive + +; move to next row + +; move back to start point +add r3, r6 +add r4, r6 + +; adjust with stride +lea r0, [r0 + (r6 + 64) * 2]; 64 = MAX_CU_SIZE +add r1, r2 +lea r1, [r1 + r6 * SIZEOF_PIXEL] + +; restore unavailable pixels +movh[r3 + r5 + 1], m0 + +decbyte r6m +jg .loopH + +; sum to global buffer +mov r1, r7m +mov r0, r8m + +; s_eoTable = {1,2,0,3,4} +pmovzxwdm0, [rsp + 0 * 2] +pshufd m0, m0, q3102 +movum1, [r0] +paddd m0, m1 +movu[r0], m0 +movzx r5d, word [rsp + 4 * 2] +add [r0 + 4 * 4], r5d + +movum0, [rsp + 5 * 2 + 0 * 4] +pshufd m0, m0, q3102 +movum1, [r1] +paddd m0, m1 +movu[r1], m0 +mov r6d, [rsp + 5 * 2 + 4 * 4] +add [r1 + 4 * 4], r6d +RET + +%else ; HIGH_BIT_DEPTH == 1 + ; TODO: x64 only because I need temporary register r7,r8, easy portab to x86 INIT_XMM sse4 cglobal saoCuStatsE2, 5,9,8,0-32; Stack: 5 of stats and 5 of count @@ -2989,6 +3112,7 @@ add [r1 + 4 * 4], r6d RET +%endif ; HIGH_BIT_DEPTH == 0 INIT_YMM avx2 cglobal saoCuStatsE2, 5,10,16; Stack: 5 of stats and 5 of count @@ -3216,6 +3340,119 @@ ;} %if ARCH_X86_64 + +%if HIGH_BIT_DEPTH == 1 +INIT_XMM sse4 +cglobal saoCuStatsE3, 4,9,8,0-32; Stack: 5 of stats and 5 of count +mov r4d, r4m +mov r5d, r5m +FIX_STRIDES r2d + +; clear internal temporary buffer +pxorm0, m0 +mova[rsp], m0 +mova[rsp + mmsize], m0 +;movam0, [pb_128] +movam5, [pw_1] +movam6, [pb_2] +movhm7, [r3 + r4] + +.loopH: +mov r6d, r4d + +.loopW: +; signDown +movum1, [r1] +movum2, [r1 + r2 - 1 * SIZEOF_PIXEL] +pcmpgtw m3, m1, m2 +pcmpgtw m2, m1 +pandm2, m5 +por
[x265] [PATCH] fix VBV hash mistake on re-encode mode
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1455683982 -28800 # Node ID 07986e2a495a915d3ffe86fae29298b46724b5fa # Parent 425b583f25dbb57af86fc5c128548038954baf31 fix VBV hash mistake on re-encode mode --- source/encoder/frameencoder.cpp | 64 -- 1 files changed, 34 insertions(+), 30 deletions(-) diff -r 425b583f25db -r 07986e2a495a source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Thu Feb 11 13:15:03 2016 +0530 +++ b/source/encoder/frameencoder.cpp Wed Feb 17 12:39:42 2016 +0800 @@ -969,44 +969,48 @@ /* Deblock with idle threading */ if (m_param->bEnableLoopFilter | m_param->bEnableSAO) { -// TODO: Multiple Threading -// Delay ONE row to avoid Intra Prediction Conflict -if (m_pool && (row >= 1)) +// NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO +if (!bIsVbv) { -// Waitting last threading finish -m_frameFilter.m_parallelFilter[row - 1].waitForExit(); +// TODO: Multiple Threading +// Delay ONE row to avoid Intra Prediction Conflict +if (m_pool && (row >= 1)) +{ +// Waitting last threading finish +m_frameFilter.m_parallelFilter[row - 1].waitForExit(); -// Processing new group -int allowCol = col; +// Processing new group +int allowCol = col; -// avoid race condition on last column -if (row >= 2) +// avoid race condition on last column +if (row >= 2) +{ +allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() + : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col); +} +m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol); +m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1); +} + +// Last Row may start early +if (m_pool && (row == m_numRows - 1)) { -allowCol = X265_MIN(((col == numCols - 1) ? 
m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() - : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col); +// Waiting for the last thread to finish +m_frameFilter.m_parallelFilter[row].waitForExit(); + +// Deblocking last row +int allowCol = col; + +// avoid race condition on last column +if (row >= 2) +{ +allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() + : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col); +} + m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol); +m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1); } -m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol); -m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1); -} - -// Last Row may start early -if (m_pool && (row == m_numRows - 1)) -{ -// Waiting for the last thread to finish -m_frameFilter.m_parallelFilter[row].waitForExit(); - -// Deblocking last row -int allowCol = col; - -// avoid race condition on last column -if (row >= 2) -{ -allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() - : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col); -} -m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol); -m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1); -} +} // end of !bIsVbv } // Both Loopfilter and SAO Disabled else ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] Align struct size to 16 bytes to avoid address computation cost
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1454669596 -28800 # Node ID 00d5efa52f591d6d8c8f8681c830fb75ea09c594 # Parent 33724cfd90827f3534ce26fbd7797946e2219208 Alignment struct size to 16 bytes to avoid address compute cost diff -r 33724cfd9082 -r 00d5efa52f59 source/encoder/slicetype.h --- a/source/encoder/slicetype.hThu Feb 04 13:29:41 2016 +0800 +++ b/source/encoder/slicetype.hFri Feb 05 18:53:16 2016 +0800 @@ -213,6 +213,7 @@ int costEst; int costEstAq; int intraMbs; +int _pad_to_16bytes; } m_slice[MAX_COOP_SLICES]; int64_t singleCost(int p0, int p1, int b, bool intraPenalty = false); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 2] sao: avoid unnecessary copy by checking next CU status
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1454563781 -28800 # Node ID 33724cfd90827f3534ce26fbd7797946e2219208 # Parent ad8ebeffdda44378dd93b787215a937a26be980e sao: avoid reduce copy by check next cu status --- source/encoder/sao.cpp |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff -r ad8ebeffdda4 -r 33724cfd9082 source/encoder/sao.cpp --- a/source/encoder/sao.cppThu Feb 04 13:29:38 2016 +0800 +++ b/source/encoder/sao.cppThu Feb 04 13:29:41 2016 +0800 @@ -630,7 +630,7 @@ bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT; int typeIdx = ctuParam[addr].typeIdx; -if (idxX != (m_numCuInWidth - 1)) +if ((idxX != (m_numCuInWidth - 1)) & (ctuParam[addr + 1].typeIdx >= 0)) { rec = reconPic->getPlaneAddr(plane, addr); for (int i = 0; i < ctuHeight + 1; i++) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 2] improve performance by processing full rows
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1454563778 -28800 # Node ID ad8ebeffdda44378dd93b787215a937a26be980e # Parent dc62b47dd0d98f732165345883edac55320baec1 improve performance by full row process --- source/encoder/framefilter.cpp | 197 +-- source/encoder/framefilter.h |4 + source/encoder/sao.cpp | 224 source/encoder/sao.h |4 +- 4 files changed, 418 insertions(+), 11 deletions(-) diff -r dc62b47dd0d9 -r ad8ebeffdda4 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Jan 25 14:59:50 2016 +0530 +++ b/source/encoder/framefilter.cppThu Feb 04 13:29:38 2016 +0800 @@ -174,6 +174,22 @@ restoreOrigLosslessYuv(cu, frame, absPartIdx); } +void FrameFilter::ParallelFilter::processSaoPcmRow(int startCol) +{ +if (m_encData->m_slice->m_pps->bTransquantBypassEnabled) +{ +const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms; +const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap; + +for (int col = startCol; col < m_frameFilter->m_numCols; col++) +{ +uint32_t cuAddr = m_rowAddr + col; +const CUData* ctu = m_encData->getPicCTU(cuAddr); +origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frameFilter->m_frame); +} +} +} + void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col) { // Copy SAO Top Reference Pixels @@ -182,7 +198,7 @@ // Luma memcpy(_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel)); -X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected"); +X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound detected"); // Chroma if (m_frameFilter->m_param->internalCsp != X265_CSP_I400) @@ -194,7 +210,32 @@ memcpy(_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel)); memcpy(_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel)); -X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer 
beyond bound write detected"); +X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound detected"); +} +} + +void FrameFilter::ParallelFilter::copySaoAboveRefRow(PicYuv* reconPic, uint32_t cuAddr, int col) +{ +// Copy SAO Top Reference Pixels +int ctuWidth = g_maxCUSize; +const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride); +const int cntCols = (m_frameFilter->m_numCols - col); + +// Luma +memcpy(_sao.m_tmpU[0][col * ctuWidth], recY, cntCols * ctuWidth * sizeof(pixel)); +X265_CHECK(col * ctuWidth + cntCols * ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer write beyond bound detected"); + +// Chroma +if (m_frameFilter->m_param->internalCsp != X265_CSP_I400) +{ +ctuWidth >>= m_sao.m_hChromaShift; + +const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC); +const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC); +memcpy(_sao.m_tmpU[1][col * ctuWidth], recU, cntCols * ctuWidth * sizeof(pixel)); +memcpy(_sao.m_tmpU[2][col * ctuWidth], recV, cntCols * ctuWidth * sizeof(pixel)); + +X265_CHECK(col * ctuWidth + cntCols * ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound detected"); } } @@ -243,7 +284,7 @@ const intptr_t stride = reconPic->m_stride; const intptr_t strideC = reconPic->m_strideC; pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr); -// // MUST BE check I400 since m_picOrg uninitialize in that case +// MUST BE check I400 since m_picOrg uninitialize in that case pixel *pixU = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCbAddr(lineStartCUAddr) : NULL; pixel *pixV = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? 
reconPic->getCrAddr(lineStartCUAddr) : NULL; int copySizeY = realW; @@ -312,6 +353,79 @@ } } +void FrameFilter::ParallelFilter::processPostRow() const +{ + +PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic; + +const uint32_t lumaMarginX = reconPic->m_lumaMarginX; +const uint32_t lumaMarginY = reconPic->m_lumaMarginY; +const uint32_t chromaMarginX = reconPic->m_chromaMarginX; +const uint32_t chromaMarginY = reconPic->m_chromaMarginY; +const int hChromaShift = reconPic->m_hChromaShift; +const int vCh
[x265] [PATCH] asm: disable AVX2 version of mbtree_propagate_cost to avoid output change
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1453202938 -28800 # Node ID 765864c3c6f02e2a3ec426974de7df7bbec7de58 # Parent 792f6ead9c50673aafd588fc2fdc0802f59d21fc asm: disable AVX2 version mbtree_propagate_cost to avoid output change --- source/common/x86/mc-a2.asm | 24 1 files changed, 12 insertions(+), 12 deletions(-) diff -r 792f6ead9c50 -r 765864c3c6f0 source/common/x86/mc-a2.asm --- a/source/common/x86/mc-a2.asm Thu Jan 14 13:35:36 2016 +0530 +++ b/source/common/x86/mc-a2.asm Tue Jan 19 19:28:58 2016 +0800 @@ -1113,12 +1113,12 @@ pmulld xm0, xm2 cvtdq2pdm0, xm0 cvtdq2pdm1, xm1 ; prop -%if cpuflag(avx2) -fmaddpd m0, m0, m6, m1 -%else +;%if cpuflag(avx2) +;fmaddpd m0, m0, m6, m1 +;%else mulpd m0, m6 ; intra*invq*fps_factor>>8 addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) -%endif +;%endif cvtdq2pdm1, xm2 ; intra psubd xm2, xm3; intra - inter cvtdq2pdm2, xm2 ; intra - inter @@ -1155,12 +1155,12 @@ pmulld xm0, xm2 cvtdq2pdm0, xm0 cvtdq2pdm1, xm1 ; prop -%if cpuflag(avx2) -fmaddpd m0, m0, m6, m1 -%else +;%if cpuflag(avx2) +;fmaddpd m0, m0, m6, m1 +;%else mulpd m0, m6 ; intra*invq*fps_factor>>8 addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) -%endif +;%endif cvtdq2pdm1, xm2 ; intra psubd xm2, xm3; intra - inter cvtdq2pdm2, xm2 ; intra - inter @@ -1189,12 +1189,12 @@ pmulld xm0, xm2 cvtdq2pdm0, xm0 cvtdq2pdm1, xm1 ; prop -%if cpuflag(avx2) -fmaddpd m0, m0, m6, m1 -%else +;%if cpuflag(avx2) +;fmaddpd m0, m0, m6, m1 +;%else mulpd m0, m6 ; intra*invq*fps_factor>>8 addpd m0, m1 ; prop + (intra*invq*fps_factor>>8) -%endif +;%endif cvtdq2pdm1, xm2 ; intra psubd xm2, xm3; intra - inter cvtdq2pdm2, xm2 ; intra - inter ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] refactor on FrameFilter and ParallelFilter, reduce duplicated data field
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1453204681 -28800 # Node ID 08dadacfe2cddfdea2c3a1e6f523c17ffa74bf09 # Parent 765864c3c6f02e2a3ec426974de7df7bbec7de58 refactor on FrameFilter and ParallelFilter, reduce duplicated data field --- source/encoder/framefilter.cpp | 113 +++ source/encoder/framefilter.h | 32 --- 2 files changed, 67 insertions(+), 78 deletions(-) diff -r 765864c3c6f0 -r 08dadacfe2cd source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppTue Jan 19 19:28:58 2016 +0800 +++ b/source/encoder/framefilter.cppTue Jan 19 19:58:01 2016 +0800 @@ -57,12 +57,14 @@ m_param = top->m_param; m_frameEncoder = frame; m_numRows = numRows; +m_numCols = numCols; m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp); m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp); m_pad[0] = top->m_sps.conformanceWindow.rightOffset; m_pad[1] = top->m_sps.conformanceWindow.bottomOffset; m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0; m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ? (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize; +m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize; if (m_param->bEnableSsim) m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3)); @@ -86,18 +88,13 @@ } } -const int lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize; for(int row = 0; row < numRows; row++) { // Setting maximum bound information -m_parallelFilter[row].m_numCols = numCols; -m_parallelFilter[row].m_numRows = numRows; m_parallelFilter[row].m_rowHeight = (row == numRows - 1) ? 
m_lastHeight : g_maxCUSize; -m_parallelFilter[row].m_lastWidth = lastWidth; -m_parallelFilter[row].m_param = m_param; m_parallelFilter[row].m_row = row; m_parallelFilter[row].m_rowAddr = row * numCols; -m_parallelFilter[row].m_frameEncoder = m_frameEncoder; +m_parallelFilter[row].m_frameFilter = this; if (row > 0) m_parallelFilter[row].m_prevRow = _parallelFilter[row - 1]; @@ -122,7 +119,6 @@ m_parallelFilter[row].m_allowedCol.set(0); m_parallelFilter[row].m_lastDeblocked.set(-1); m_parallelFilter[row].m_encData = frame->m_encData; -m_parallelFilter[row].m_frame = frame; } // Reset SAO common statistics @@ -189,7 +185,7 @@ X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected"); // Chroma -if (m_param->internalCsp != X265_CSP_I400) +if (m_frameFilter->m_param->internalCsp != X265_CSP_I400) { ctuWidth >>= m_sao.m_hChromaShift; @@ -213,30 +209,30 @@ if (m_encData->m_slice->m_pps->bTransquantBypassEnabled) { -const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms; -const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap; +const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms; +const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap; uint32_t cuAddr = m_rowAddr + col; const CUData* ctu = m_encData->getPicCTU(cuAddr); -assert(m_frame->m_reconPic == m_encData->m_reconPic); -origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frame); +assert(m_frameFilter->m_frame->m_reconPic == m_encData->m_reconPic); +origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frameFilter->m_frame); } } // NOTE: MUST BE delay a row when Deblock enabled, the Deblock will modify above pixels in Horizon pass -void FrameFilter::ParallelFilter::processPostCu(uint32_t col) const +void FrameFilter::ParallelFilter::processPostCu(int col) const { // Update finished CU cursor -m_frame->m_reconColCount[m_row].set(col); +m_frameFilter->m_frame->m_reconColCount[m_row].set(col); // shortcut path for 
non-border area -if ((col != 0) & (col != m_numCols - 1) & (m_row != 0) & (m_row != m_numRows - 1)) +if ((col != 0) & (col != m_frameFilter->m_numCols - 1) & (m_row != 0) & (m_row != m_frameFilter->m_numRows - 1)) return; -PicYuv *reconPic = m_frame->m_reconPic; +PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic; const uint32_t lineStartCUAddr = m_rowAddr + col; const int realH = getCUHeight(); -const int realW = getCUWidth(col); +const int realW = m_frameFilter->getCUWidth(col); const uint32_t lumaMarginX = reconPic->m_lumaMarginX;
[x265] [PATCH] fix typo in patch 'fix non-determination output after apply new preset parameter sets'
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1453270393 -28800 # Node ID 0a738dd3ae11d1621f99f8006cb7a507cde5069f # Parent 808ece071d225f300feaf08709a9f5e0872edc89 fix typo error on patch 'fix non-determination output after apply new preset parameter sets' --- source/encoder/analysis.cpp |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff -r 808ece071d22 -r 0a738dd3ae11 source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Mon Jan 18 21:21:25 2016 +0530 +++ b/source/encoder/analysis.cpp Wed Jan 20 14:13:13 2016 +0800 @@ -912,7 +912,7 @@ { md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0 md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1 -md.pred[PRED_2Nx2N].rdCost = 0; +md.pred[PRED_2Nx2N].sa8dCost = 0; } /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */ ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] fix crash when no-sao & no-lft
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1452615873 21600 # Node ID b0b34836e66dd792c51c3e1dde90054a154b1867 # Parent 6ccd503a4c3a2f6ed215584859cdf35ee7b80bd9 fix crash when no-sao & no-lft --- source/encoder/frameencoder.cpp |2 +- source/encoder/framefilter.cpp | 98 +-- source/encoder/framefilter.h|1 - 3 files changed, 2 insertions(+), 99 deletions(-) diff -r 6ccd503a4c3a -r b0b34836e66d source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Tue Jan 12 11:36:55 2016 +0530 +++ b/source/encoder/frameencoder.cpp Tue Jan 12 10:24:33 2016 -0600 @@ -1011,7 +1011,7 @@ // Both Loopfilter and SAO Disabled else { -m_frameFilter.processPostCu(row, col); +m_frameFilter.m_parallelFilter[row].processPostCu(col); } // Completed CU processing diff -r 6ccd503a4c3a -r b0b34836e66d source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppTue Jan 12 11:36:55 2016 +0530 +++ b/source/encoder/framefilter.cppTue Jan 12 10:24:33 2016 -0600 @@ -67,8 +67,7 @@ if (m_param->bEnableSsim) m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3)); -if (m_param->bEnableLoopFilter | m_param->bEnableSAO) -m_parallelFilter = new ParallelFilter[numRows]; +m_parallelFilter = new ParallelFilter[numRows]; if (m_parallelFilter) { @@ -514,101 +513,6 @@ } } -// NOTE: This version for case that Disable both Deblock and Sao -void FrameFilter::processPostCu(uint32_t row, uint32_t col) const -{ -// Update finished CU cursor -m_frame->m_reconColCount[row].set(col); - -// shortcut path for non-border area -if ((col != 0) & (col != m_parallelFilter[row].m_numCols - 1) & (row != 0) & (row != m_parallelFilter[row].m_numRows - 1)) -return; - -PicYuv *reconPic = m_frame->m_reconPic; -const uint32_t rowAddr = row * m_parallelFilter[row].m_numCols; -const uint32_t lineStartCUAddr = rowAddr + col; -const int realH = m_parallelFilter[row].getCUHeight(); -const int realW = m_parallelFilter[row].getCUWidth(col); - -const uint32_t lumaMarginX = reconPic->m_lumaMarginX; 
-const uint32_t lumaMarginY = reconPic->m_lumaMarginY; -const uint32_t chromaMarginX = reconPic->m_chromaMarginX; -const uint32_t chromaMarginY = reconPic->m_chromaMarginY; -const int hChromaShift = reconPic->m_hChromaShift; -const int vChromaShift = reconPic->m_vChromaShift; -const intptr_t stride = reconPic->m_stride; -const intptr_t strideC = reconPic->m_strideC; -pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr); -// MUST BE check I400 since m_picOrg uninitialize in that case -pixel *pixU = (m_param->internalCsp != X265_CSP_I400) ? reconPic->getCbAddr(lineStartCUAddr) : NULL; -pixel *pixV = (m_param->internalCsp != X265_CSP_I400) ? reconPic->getCrAddr(lineStartCUAddr) : NULL; -int copySizeY = realW; -int copySizeC = (realW >> hChromaShift); - -if ((col == 0) | (col == m_parallelFilter[row].m_numCols - 1)) -{ -// TODO: improve by process on Left or Right only -primitives.extendRowBorder(reconPic->getLumaAddr(rowAddr), stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX); - -if (m_param->internalCsp != X265_CSP_I400) -{ -primitives.extendRowBorder(reconPic->getCbAddr(rowAddr), strideC, reconPic->m_picWidth >> hChromaShift, realH >> vChromaShift, reconPic->m_chromaMarginX); -primitives.extendRowBorder(reconPic->getCrAddr(rowAddr), strideC, reconPic->m_picWidth >> hChromaShift, realH >> vChromaShift, reconPic->m_chromaMarginX); -} -} - -// Extra Left and Right border on first and last CU -if ((col == 0) | (col == m_parallelFilter[row].m_numCols - 1)) -{ -copySizeY += lumaMarginX; -copySizeC += chromaMarginX; -} - -// First column need extension left padding area and first CU -if (col == 0) -{ -pixY -= lumaMarginX; -pixU -= chromaMarginX; -pixV -= chromaMarginX; -} - -// Border extend Top -if (row == 0) -{ -for (uint32_t y = 0; y < lumaMarginY; y++) -memcpy(pixY - (y + 1) * stride, pixY, copySizeY * sizeof(pixel)); - -if (m_param->internalCsp != X265_CSP_I400) -{ -for (uint32_t y = 0; y < chromaMarginY; y++) -{ -memcpy(pixU - (y + 1) * strideC, pixU, 
copySizeC * sizeof(pixel)); -memcpy(pixV - (y + 1) * strideC, pixV, copySizeC * sizeof(pixel)); -} -} -} - -// Border extend Bottom -if (row == m_parallelFilter[row].m_numRows - 1) -{ -pixY += (realH - 1) * stride; -for (uint32_t y = 0; y <
[x265] [PATCH] fix deadlock and output change on new ParallelFilter framework. (Issue #225)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1452545402 21600 # Node ID 1c273c83a4478ed39b0e6734eec1ba1cfccd07d6 # Parent 19cfada7162147f293e37302e4c7f9c1760928a0 fix deadlock and output change on new ParallelFilter framework. (Issue #225) The bug comes from two parts: 1. our old sync system allowed the last column to execute in parallel 2. the new ParallelFilter logic distributes threads and waits for the status change, which creates a race condition --- source/encoder/frameencoder.cpp | 108 +++ 1 files changed, 53 insertions(+), 55 deletions(-) diff -r 19cfada71621 -r 1c273c83a447 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Jan 11 16:28:39 2016 +0530 +++ b/source/encoder/frameencoder.cpp Mon Jan 11 14:50:02 2016 -0600 @@ -962,6 +962,58 @@ // Save CABAC state for next row curRow.bufferedEntropy.loadContexts(rowCoder); +/* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */ +if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) + m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); + +/* Deblock with idle threading */ +if (m_param->bEnableLoopFilter | m_param->bEnableSAO) +{ +// TODO: Multiple Threading +// Delay ONE row to avoid Intra Prediction Conflict +if (m_pool && (row >= 1)) +{ +// Waitting last threading finish +m_frameFilter.m_parallelFilter[row - 1].waitForExit(); + +// Processing new group +int allowCol = col; + +// avoid race condition on last column +if (row >= 2) +{ +allowCol = X265_MIN(((col == numCols - 1) ? 
m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() + : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col); +} +m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol); +m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1); +} + +// Last Row may start early +if (m_pool && (row == m_numRows - 1)) +{ +// Waiting for the last thread to finish +m_frameFilter.m_parallelFilter[row].waitForExit(); + +// Deblocking last row +int allowCol = col; + +// avoid race condition on last column +if (row >= 2) +{ +allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() + : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col); +} +m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol); +m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1); +} +} +// Both Loopfilter and SAO Disabled +else +{ +m_frameFilter.processPostCu(row, col); +} + // Completed CU processing curRow.completed++; @@ -1091,60 +1143,6 @@ } } -// TODO: move Deblock and SAO to before VBV check - -/* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */ -if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) - m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); - -/* Deblock with idle threading */ -if (m_param->bEnableLoopFilter | m_param->bEnableSAO) -{ -// TODO: Multiple Threading -// Delay ONE row to avoid Intra Prediction Conflict -if (m_pool && (row >= 1)) -{ -// Waitting last threading finish -m_frameFilter.m_parallelFilter[row - 1].waitForExit(); - -// Processing new group -int allowCol = col; - -// avoid race condition on last column -if (row >= 2) -{ -allowCol = X265_MIN(((col == numCols - 1) ? 
m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() - : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col); -} -m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol); -m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1); -} - -// Last Row may start early -if (m_pool && (ro
[x265] [PATCH] fix crash on no-wpp mode (Issue #217)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1452096133 21600 # Node ID 2417df96af4efcaa8c16e7138028d8b2bbb034ab # Parent 25f78ff3d8efaa1e9d85bc3e718c887ec9afa557 fix crash on no-wpp mode (Issue #217) --- source/encoder/frameencoder.cpp |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff -r 25f78ff3d8ef -r 2417df96af4e source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Tue Dec 22 18:13:28 2015 +0530 +++ b/source/encoder/frameencoder.cpp Wed Jan 06 10:02:13 2016 -0600 @@ -1102,7 +1102,7 @@ { // TODO: Multiple Threading // Delay ONE row to avoid Intra Prediction Conflict -if (row >= 1) +if (m_pool && (row >= 1)) { // Waitting last threading finish m_frameFilter.m_parallelFilter[row - 1].waitForExit(); @@ -1121,7 +1121,7 @@ } // Last Row may start early -if (row == m_numRows - 1) +if (m_pool && (row == m_numRows - 1)) { // Waiting for the last thread to finish m_frameFilter.m_parallelFilter[row].waitForExit(); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 2] remove static member from class ParallelFilter
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1452121207 21600 # Node ID 375c1f0cfd8a69e4a118cf4e4094f871c0c3a216 # Parent 2417df96af4efcaa8c16e7138028d8b2bbb034ab remove static member from class ParallelFilter --- source/encoder/framefilter.cpp | 93 +++- source/encoder/framefilter.h | 22 ++ 2 files changed, 57 insertions(+), 58 deletions(-) diff -r 2417df96af4e -r 375c1f0cfd8a source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppWed Jan 06 10:02:13 2016 -0600 +++ b/source/encoder/framefilter.cppWed Jan 06 17:00:07 2016 -0600 @@ -35,11 +35,6 @@ static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height); static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt); -uint32_t FrameFilter::ParallelFilter::numCols = 0; -uint32_t FrameFilter::ParallelFilter::numRows = 0; -uint32_t FrameFilter::ParallelFilter::lastHeight = 0; -uint32_t FrameFilter::ParallelFilter::lastWidth = 0; - void FrameFilter::destroy() { X265_FREE(m_ssimBuf); @@ -94,6 +89,11 @@ for(int row = 0; row < numRows; row++) { +// Setting maximum bound information +m_parallelFilter[row].m_numCols = numCols; +m_parallelFilter[row].m_numRows = numRows; +m_parallelFilter[row].m_lastHeight = m_lastHeight; +m_parallelFilter[row].m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize; m_parallelFilter[row].m_param = m_param; m_parallelFilter[row].m_row = row; m_parallelFilter[row].m_rowAddr = row * numCols; @@ -104,11 +104,6 @@ } } -// Setting maximum columns -ParallelFilter::numCols = numCols; -ParallelFilter::numRows = numRows; -ParallelFilter::lastHeight = m_lastHeight; -ParallelFilter::lastWidth = (m_param->sourceWidth % g_maxCUSize) ? 
(m_param->sourceWidth % g_maxCUSize) : g_maxCUSize; } void FrameFilter::start(Frame *frame, Entropy& initState, int qp) @@ -235,7 +230,7 @@ m_frame->m_reconColCount[m_row].set(col); // shortcut path for non-border area -if ((col != 0) & (col != numCols - 1) & (m_row != 0) & (m_row != numRows - 1)) +if ((col != 0) & (col != m_numCols - 1) & (m_row != 0) & (m_row != m_numRows - 1)) return; PicYuv *reconPic = m_frame->m_reconPic; @@ -258,7 +253,7 @@ int copySizeY = realW; int copySizeC = (realW >> hChromaShift); -if ((col == 0) | (col == numCols - 1)) +if ((col == 0) | (col == m_numCols - 1)) { // TODO: improve by process on Left or Right only primitives.extendRowBorder(reconPic->getLumaAddr(m_rowAddr), stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX); @@ -271,7 +266,7 @@ } // Extra Left and Right border on first and last CU -if ((col == 0) | (col == numCols - 1)) +if ((col == 0) | (col == m_numCols - 1)) { copySizeY += lumaMarginX; copySizeC += chromaMarginX; @@ -302,7 +297,7 @@ } // Border extend Bottom -if (m_row == numRows - 1) +if (m_row == m_numRows - 1) { pixY += (realH - 1) * stride; pixU += ((realH >> vChromaShift) - 1) * strideC; @@ -386,9 +381,9 @@ m_lastCol.incr(); } -if (colEnd == (int)numCols) +if (colEnd == (int)m_numCols) { -const uint32_t cuAddr = m_rowAddr + numCols - 1; +const uint32_t cuAddr = m_rowAddr + m_numCols - 1; if (m_param->bEnableLoopFilter) { @@ -397,47 +392,47 @@ // When SAO Disable, setting column counter here if ((!m_param->bEnableSAO) & (m_row >= 1)) -m_prevRow->processPostCu(numCols - 1); +m_prevRow->processPostCu(m_numCols - 1); } // TODO: move processPostCu() into processSaoUnitCu() if (m_param->bEnableSAO) { // Save SAO bottom row reference pixels -copySaoAboveRef(reconPic, cuAddr, numCols - 1); +copySaoAboveRef(reconPic, cuAddr, m_numCols - 1); // SAO Decide // NOTE: reduce condition check for 1 CU only video, Why someone play with it? 
-if (numCols >= 2) -m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 2, cuAddr - 1); +if (m_numCols >= 2) +m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, m_numCols - 2, cuAddr - 1); -if (numCols >= 1) -m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 1, cuAddr); +if (m_numCols >= 1) +m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, m_numCols - 1, cuAddr); // Proce
[x265] [PATCH 2 of 2] improve getCUHeight()
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1452121210 21600 # Node ID d4389c744980cf69cad5c948e4d62e60057a98ba # Parent 375c1f0cfd8a69e4a118cf4e4094f871c0c3a216 improve getCUHeight() --- source/encoder/framefilter.cpp | 17 + source/encoder/framefilter.h |8 2 files changed, 13 insertions(+), 12 deletions(-) diff -r 375c1f0cfd8a -r d4389c744980 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppWed Jan 06 17:00:07 2016 -0600 +++ b/source/encoder/framefilter.cppWed Jan 06 17:00:10 2016 -0600 @@ -87,13 +87,14 @@ } } +const int lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize; for(int row = 0; row < numRows; row++) { // Setting maximum bound information m_parallelFilter[row].m_numCols = numCols; m_parallelFilter[row].m_numRows = numRows; -m_parallelFilter[row].m_lastHeight = m_lastHeight; -m_parallelFilter[row].m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize; +m_parallelFilter[row].m_rowHeight = (row == numRows - 1) ? 
m_lastHeight : g_maxCUSize; +m_parallelFilter[row].m_lastWidth = lastWidth; m_parallelFilter[row].m_param = m_param; m_parallelFilter[row].m_row = row; m_parallelFilter[row].m_rowAddr = row * numCols; @@ -235,7 +236,7 @@ PicYuv *reconPic = m_frame->m_reconPic; const uint32_t lineStartCUAddr = m_rowAddr + col; -const int realH = getCUHeight(m_row); +const int realH = getCUHeight(); const int realW = getCUWidth(col); const uint32_t lumaMarginX = reconPic->m_lumaMarginX; @@ -526,7 +527,7 @@ PicYuv *reconPic = m_frame->m_reconPic; const uint32_t rowAddr = row * m_parallelFilter[row].m_numCols; const uint32_t lineStartCUAddr = rowAddr + col; -const int realH = m_parallelFilter[row].getCUHeight(row); +const int realH = m_parallelFilter[row].getCUHeight(); const int realW = m_parallelFilter[row].getCUWidth(col); const uint32_t lumaMarginX = reconPic->m_lumaMarginX; @@ -624,7 +625,7 @@ intptr_t stride = reconPic->m_stride; uint32_t width = reconPic->m_picWidth - m_pad[0]; -uint32_t height = m_parallelFilter[row].getCUHeight(row); +uint32_t height = m_parallelFilter[row].getCUHeight(); uint64_t ssdY = computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height); m_frameEncoder->m_SSDY += ssdY; @@ -664,7 +665,7 @@ } if (m_param->decodedPictureHashSEI == 1) { -uint32_t height = m_parallelFilter[row].getCUHeight(row); +uint32_t height = m_parallelFilter[row].getCUHeight(); uint32_t width = reconPic->m_picWidth; intptr_t stride = reconPic->m_stride; @@ -690,7 +691,7 @@ } else if (m_param->decodedPictureHashSEI == 2) { -uint32_t height = m_parallelFilter[row].getCUHeight(row); +uint32_t height = m_parallelFilter[row].getCUHeight(); uint32_t width = reconPic->m_picWidth; intptr_t stride = reconPic->m_stride; @@ -712,7 +713,7 @@ else if (m_param->decodedPictureHashSEI == 3) { uint32_t width = reconPic->m_picWidth; -uint32_t height = m_parallelFilter[row].getCUHeight(row); +uint32_t height = m_parallelFilter[row].getCUHeight(); intptr_t stride = 
reconPic->m_stride; uint32_t cuHeight = g_maxCUSize; diff -r 375c1f0cfd8a -r d4389c744980 source/encoder/framefilter.h --- a/source/encoder/framefilter.h Wed Jan 06 17:00:07 2016 -0600 +++ b/source/encoder/framefilter.h Wed Jan 06 17:00:10 2016 -0600 @@ -63,7 +63,7 @@ public: uint32_tm_numCols; uint32_tm_numRows; -uint32_tm_lastHeight; +uint32_tm_rowHeight; uint32_tm_lastWidth; uint32_tm_row; uint32_tm_rowAddr; @@ -80,7 +80,7 @@ ParallelFilter() : m_numCols(0) , m_numRows(0) -, m_lastHeight(0) +, m_rowHeight(0) , m_lastWidth(0) , m_row(0) , m_rowAddr(0) @@ -106,9 +106,9 @@ // Post-Process (Border extension) void processPostCu(uint32_t col) const; -uint32_t getCUHeight(int rowNum) const +uint32_t getCUHeight() const { -return (rowNum == (int)m_numRows - 1) ? m_lastHeight : g_maxCUSize; +return m_rowHeight; } uint32_t getCUWidth(int colNum) const ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 2] asm: rewrite 16bpp partial pixels process code on upShift and downShift (Issue #223)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1451581652 21600 # Node ID 59863bb9bd70f81af21c8cf53e2703adc2723a87 # Parent 9e0fe9704998425e8d014fdfdb3c12f24e6c3cd9 asm: rewrite 16bpp partial pixels process code on upShift and downShift (Issue #223) --- source/common/x86/pixel-a.asm | 327 ++--- source/test/pixelharness.cpp | 25 +++- 2 files changed, 103 insertions(+), 249 deletions(-) diff -r 9e0fe9704998 -r 59863bb9bd70 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Thu Dec 31 10:29:43 2015 -0600 +++ b/source/common/x86/pixel-a.asm Thu Dec 31 11:07:32 2015 -0600 @@ -8154,92 +8154,57 @@ ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ; INIT_XMM sse2 -cglobal downShift_16, 7,7,3 -movdm0, r6d; m0 = shift +cglobal downShift_16, 4,7,3 +mov r4d, r4m +mov r5d, r5m +movdm0, r6m; m0 = shift add r1, r1 + dec r5d .loopH: xor r6, r6 + .loopW: movum1, [r0 + r6 * 2] -movum2, [r0 + r6 * 2 + 16] +movum2, [r0 + r6 * 2 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswbm1, m2 movu[r2 + r6], m1 -add r6, 16 +add r6, mmsize cmp r6d, r4d -jl .loopW +jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d -jnz .loopH - -;processing last row of every frame [To handle width which not a multiple of 16] - +jnz.loopH + +;processing last row of every frame [To handle width which not a multiple of 16] +; r4d must be more than or equal to 16(mmsize) .loop16: +movum1, [r0 + (r4 - mmsize) * 2] +movum2, [r0 + (r4 - mmsize) * 2 + mmsize] +psrlw m1, m0 +psrlw m2, m0 +packuswbm1, m2 +movu[r2 + r4 - mmsize], m1 + +sub r4d, mmsize +jz .end +cmp r4d, mmsize +jge.loop16 + +; process partial pixels movum1, [r0] -movum2, [r0 + 16] +movum2, [r0 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswbm1, m2 movu[r2], m1 -add r0, 2 * mmsize -add r2, mmsize -sub r4d, 16 -jz .end -cmp r4d, 15 -jg .loop16 - -cmp r4d, 8 -jl .process4 -movum1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movh[r2], m1 - -add r0, mmsize 
-add r2, 8 -sub r4d, 8 -jz .end - -.process4: -cmp r4d, 4 -jl .process2 -movhm1,[r0] -psrlw m1, m0 -packuswbm1, m1 -movd[r2], m1 - -add r0, 8 -add r2, 4 -sub r4d, 4 -jz .end - -.process2: -cmp r4d, 2 -jl .process1 -movdm1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movdr6, m1 -mov [r2], r6w - -add r0, 4 -add r2, 2 -sub r4d, 2 -jz .end - -.process1: -movdm1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movdr3, m1 -mov [r2], r3b .end: RET @@ -8248,12 +8213,16 @@ ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ;- INIT_YMM avx2 -cglobal downShift_16, 6,7,3 +cglobal downShift_16, 4,7,3 +mov r4d, r4m +mov r5d, r5m movdxm0, r6m; m0 = shift add r1d, r1d + dec r5d .loopH: xor r6, r6 + .loopW: movum1, [r0 + r6 * 2 + 0] movum2, [r0 + r6 * 2 + 32] @@ -8265,92 +8234,39 @@ add r6d, mmsize cmp r6d, r4d -jl .loopW +jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d -jnz .loopH - -; processing last row of every frame [To handle width which not a multiple of 32] -mov r6d, r4d -and r4d, 31 -shr r6d, 5 +jnz.loopH + +; processing last row of every frame [To handle width which not a multiple of 32] .loop32: -movum1, [r0] -movum2, [r0 + 32] +movum1, [r0 + (r4 - mmsize) * 2] +movum2, [r0 + (r4 - mmsize) * 2 + mmsize] psrlw m1, xm0
[x265] [PATCH 1 of 2] optimize sync logic to improve speed on preset medium and below
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1451579383 21600 # Node ID 9e0fe9704998425e8d014fdfdb3c12f24e6c3cd9 # Parent 375ce77b8c35ae332bf51085f6d26044d55ae264 optimize sync logic to improve speed on preset medium and below --- source/encoder/frameencoder.cpp | 59 +- source/encoder/framefilter.cpp | 37 source/encoder/framefilter.h|2 +- 3 files changed, 46 insertions(+), 52 deletions(-) diff -r 375ce77b8c35 -r 9e0fe9704998 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 28 16:06:55 2015 -0600 +++ b/source/encoder/frameencoder.cpp Thu Dec 31 10:29:43 2015 -0600 @@ -1204,64 +1204,21 @@ rowCoder.finishSlice(); /* Processing left Deblock block with current threading */ -if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1)) +if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 2)) { /* TODO: Multiple Threading */ -/* Check to avoid previous row process slower than current row */ -if (row >= 2) + +/* Check conditional to start previous row process with current threading */ +if (m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() == (int)numCols) { -int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get(); -while(prevCol != (int)numCols) -prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.waitForChange(prevCol); +/* stop threading on current row and restart it */ +m_frameFilter.m_parallelFilter[row - 1].waitForExit(); +m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols); +m_frameFilter.m_parallelFilter[row - 1].processTasks(-1); } -m_frameFilter.m_parallelFilter[row - 1].waitForExit(); -m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols); -m_frameFilter.m_parallelFilter[row - 1].processTasks(-1); } /* trigger row-wise loop filters */ -if (row == m_numRows - 1) -{ -/* TODO: Early start last row */ -if (m_param->bEnableLoopFilter | m_param->bEnableSAO) -{ -if (m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() != 
(int)numCols) -x265_log(m_param, X265_LOG_WARNING, "detected ParallelFilter race condition on last row\n"); - -// avoid race on last row and last column -if (row >= 1) -{ -int prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get(); -while(prevCol != (int)numCols) -prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.waitForChange(prevCol); -} - -/* NOTE: Last Row not execute before, so didn't need wait */ -m_frameFilter.m_parallelFilter[row].waitForExit(); -m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols); -m_frameFilter.m_parallelFilter[row].processTasks(-1); - -/* Apply SAO on last row of CUs, because we always apply SAO on row[X-1] */ -if (m_param->bEnableSAO) -{ -FrameData* encData = m_frameFilter.m_parallelFilter[row].m_encData; -SAOParam* saoParam = encData->m_saoParam; -for(uint32_t col = 0; col < numCols; col++) -{ -// NOTE: must use processSaoUnitCu(), it include TQBypass logic - m_frameFilter.m_parallelFilter[row].processSaoUnitCu(saoParam, col); -} -} - -// Process border extension on last row -for(uint32_t col = 0; col < numCols; col++) -{ -// m_reconColCount will be set in processPostCu() -m_frameFilter.m_parallelFilter[row].processPostCu(col); -} -} -} - if (m_param->bEnableWavefront) { if (row >= m_filterRowDelay) diff -r 375ce77b8c35 -r 9e0fe9704998 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 28 16:06:55 2015 -0600 +++ b/source/encoder/framefilter.cppThu Dec 31 10:29:43 2015 -0600 @@ -460,6 +460,43 @@ // SAO: was integrate into encode loop SAOParam* saoParam = encData.m_saoParam; +/* Processing left block Deblock with current threading */ +{ +/* stop threading on current row */ +m_parallelFilter[row].waitForExit(); + +/* Check to avoid previous row process slower than current row */ +if (row >= 1) +X265_CHECK(m_parallelFilter[row - 1].m_lastDeblocked.get() == (int)ParallelFilter::numCols, "previous row not finish"); + 
+m_parallelFilter[row].m_allowedCol.set(ParallelFilter::numCols); +m_parallelFilter[row].p
[x265] [PATCH 2 of 2] asm: rewrite 16bpp partial pixels process code on upShift and downShift
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1451580330 21600 # Node ID 680f058d2d58ca7e3df9d8cc742335339abe0be8 # Parent 9e0fe9704998425e8d014fdfdb3c12f24e6c3cd9 asm: rewrite 16bpp partial pixels process code on upShift and downShift --- source/common/x86/pixel-a.asm | 327 ++--- source/test/pixelharness.cpp | 25 +++- 2 files changed, 103 insertions(+), 249 deletions(-) diff -r 9e0fe9704998 -r 680f058d2d58 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Thu Dec 31 10:29:43 2015 -0600 +++ b/source/common/x86/pixel-a.asm Thu Dec 31 10:45:30 2015 -0600 @@ -8154,92 +8154,57 @@ ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ; INIT_XMM sse2 -cglobal downShift_16, 7,7,3 -movdm0, r6d; m0 = shift +cglobal downShift_16, 4,7,3 +mov r4d, r4m +mov r5d, r5m +movdm0, r6m; m0 = shift add r1, r1 + dec r5d .loopH: xor r6, r6 + .loopW: movum1, [r0 + r6 * 2] -movum2, [r0 + r6 * 2 + 16] +movum2, [r0 + r6 * 2 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswbm1, m2 movu[r2 + r6], m1 -add r6, 16 +add r6, mmsize cmp r6d, r4d -jl .loopW +jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d -jnz .loopH - -;processing last row of every frame [To handle width which not a multiple of 16] - +jnz.loopH + +;processing last row of every frame [To handle width which not a multiple of 16] +; r4d must be more than or equal to 16(mmsize) .loop16: +movum1, [r0 + (r4 - mmsize) * 2] +movum2, [r0 + (r4 - mmsize) * 2 + mmsize] +psrlw m1, m0 +psrlw m2, m0 +packuswbm1, m2 +movu[r2 + r4 - mmsize], m1 + +sub r4d, mmsize +jz .end +cmp r4d, mmsize +jge.loop16 + +; process partial pixels movum1, [r0] -movum2, [r0 + 16] +movum2, [r0 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswbm1, m2 movu[r2], m1 -add r0, 2 * mmsize -add r2, mmsize -sub r4d, 16 -jz .end -cmp r4d, 15 -jg .loop16 - -cmp r4d, 8 -jl .process4 -movum1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movh[r2], m1 - -add r0, mmsize -add r2, 8 
-sub r4d, 8 -jz .end - -.process4: -cmp r4d, 4 -jl .process2 -movhm1,[r0] -psrlw m1, m0 -packuswbm1, m1 -movd[r2], m1 - -add r0, 8 -add r2, 4 -sub r4d, 4 -jz .end - -.process2: -cmp r4d, 2 -jl .process1 -movdm1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movdr6, m1 -mov [r2], r6w - -add r0, 4 -add r2, 2 -sub r4d, 2 -jz .end - -.process1: -movdm1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movdr3, m1 -mov [r2], r3b .end: RET @@ -8248,12 +8213,16 @@ ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ;- INIT_YMM avx2 -cglobal downShift_16, 6,7,3 +cglobal downShift_16, 4,7,3 +mov r4d, r4m +mov r5d, r5m movdxm0, r6m; m0 = shift add r1d, r1d + dec r5d .loopH: xor r6, r6 + .loopW: movum1, [r0 + r6 * 2 + 0] movum2, [r0 + r6 * 2 + 32] @@ -8265,92 +8234,39 @@ add r6d, mmsize cmp r6d, r4d -jl .loopW +jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d -jnz .loopH - -; processing last row of every frame [To handle width which not a multiple of 32] -mov r6d, r4d -and r4d, 31 -shr r6d, 5 +jnz.loopH + +; processing last row of every frame [To handle width which not a multiple of 32] .loop32: -movum1, [r0] -movum2, [r0 + 32] +movum1, [r0 + (r4 - mmsize) * 2] +movum2, [r0 + (r4 - mmsize) * 2 + mmsize] psrlw m1, xm0 psrlw m
[x265] [PATCH 1 of 2] optimize sync logic to improve speed on preset medium and below
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1451579383 21600 # Node ID 9e0fe9704998425e8d014fdfdb3c12f24e6c3cd9 # Parent 375ce77b8c35ae332bf51085f6d26044d55ae264 optimize sync logic to improve speed on preset medium and below --- source/encoder/frameencoder.cpp | 59 +- source/encoder/framefilter.cpp | 37 source/encoder/framefilter.h|2 +- 3 files changed, 46 insertions(+), 52 deletions(-) diff -r 375ce77b8c35 -r 9e0fe9704998 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 28 16:06:55 2015 -0600 +++ b/source/encoder/frameencoder.cpp Thu Dec 31 10:29:43 2015 -0600 @@ -1204,64 +1204,21 @@ rowCoder.finishSlice(); /* Processing left Deblock block with current threading */ -if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1)) +if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 2)) { /* TODO: Multiple Threading */ -/* Check to avoid previous row process slower than current row */ -if (row >= 2) + +/* Check conditional to start previous row process with current threading */ +if (m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() == (int)numCols) { -int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get(); -while(prevCol != (int)numCols) -prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.waitForChange(prevCol); +/* stop threading on current row and restart it */ +m_frameFilter.m_parallelFilter[row - 1].waitForExit(); +m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols); +m_frameFilter.m_parallelFilter[row - 1].processTasks(-1); } -m_frameFilter.m_parallelFilter[row - 1].waitForExit(); -m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols); -m_frameFilter.m_parallelFilter[row - 1].processTasks(-1); } /* trigger row-wise loop filters */ -if (row == m_numRows - 1) -{ -/* TODO: Early start last row */ -if (m_param->bEnableLoopFilter | m_param->bEnableSAO) -{ -if (m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() != 
(int)numCols) -x265_log(m_param, X265_LOG_WARNING, "detected ParallelFilter race condition on last row\n"); - -// avoid race on last row and last column -if (row >= 1) -{ -int prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get(); -while(prevCol != (int)numCols) -prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.waitForChange(prevCol); -} - -/* NOTE: Last Row not execute before, so didn't need wait */ -m_frameFilter.m_parallelFilter[row].waitForExit(); -m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols); -m_frameFilter.m_parallelFilter[row].processTasks(-1); - -/* Apply SAO on last row of CUs, because we always apply SAO on row[X-1] */ -if (m_param->bEnableSAO) -{ -FrameData* encData = m_frameFilter.m_parallelFilter[row].m_encData; -SAOParam* saoParam = encData->m_saoParam; -for(uint32_t col = 0; col < numCols; col++) -{ -// NOTE: must use processSaoUnitCu(), it include TQBypass logic - m_frameFilter.m_parallelFilter[row].processSaoUnitCu(saoParam, col); -} -} - -// Process border extension on last row -for(uint32_t col = 0; col < numCols; col++) -{ -// m_reconColCount will be set in processPostCu() -m_frameFilter.m_parallelFilter[row].processPostCu(col); -} -} -} - if (m_param->bEnableWavefront) { if (row >= m_filterRowDelay) diff -r 375ce77b8c35 -r 9e0fe9704998 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 28 16:06:55 2015 -0600 +++ b/source/encoder/framefilter.cppThu Dec 31 10:29:43 2015 -0600 @@ -460,6 +460,43 @@ // SAO: was integrate into encode loop SAOParam* saoParam = encData.m_saoParam; +/* Processing left block Deblock with current threading */ +{ +/* stop threading on current row */ +m_parallelFilter[row].waitForExit(); + +/* Check to avoid previous row process slower than current row */ +if (row >= 1) +X265_CHECK(m_parallelFilter[row - 1].m_lastDeblocked.get() == (int)ParallelFilter::numCols, "previous row not finish"); + 
+m_parallelFilter[row].m_allowedCol.set(ParallelFilter::numCols); +m_parallelFilter[row].p
[x265] [PATCH update Release mode warning] optimize sync logic to improve speed on preset medium and below
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1451588073 21600 # Node ID fb58aa76a9ae1eb6b0bac37f34bc418ba2ce941c # Parent 375ce77b8c35ae332bf51085f6d26044d55ae264 optimize sync logic to improve speed on preset medium and below --- source/encoder/frameencoder.cpp | 59 +- source/encoder/framefilter.cpp | 36 +++ source/encoder/framefilter.h|2 +- 3 files changed, 45 insertions(+), 52 deletions(-) diff -r 375ce77b8c35 -r fb58aa76a9ae source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 28 16:06:55 2015 -0600 +++ b/source/encoder/frameencoder.cpp Thu Dec 31 12:54:33 2015 -0600 @@ -1204,64 +1204,21 @@ rowCoder.finishSlice(); /* Processing left Deblock block with current threading */ -if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1)) +if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 2)) { /* TODO: Multiple Threading */ -/* Check to avoid previous row process slower than current row */ -if (row >= 2) + +/* Check conditional to start previous row process with current threading */ +if (m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() == (int)numCols) { -int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get(); -while(prevCol != (int)numCols) -prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.waitForChange(prevCol); +/* stop threading on current row and restart it */ +m_frameFilter.m_parallelFilter[row - 1].waitForExit(); +m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols); +m_frameFilter.m_parallelFilter[row - 1].processTasks(-1); } -m_frameFilter.m_parallelFilter[row - 1].waitForExit(); -m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols); -m_frameFilter.m_parallelFilter[row - 1].processTasks(-1); } /* trigger row-wise loop filters */ -if (row == m_numRows - 1) -{ -/* TODO: Early start last row */ -if (m_param->bEnableLoopFilter | m_param->bEnableSAO) -{ -if (m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() != 
(int)numCols) -x265_log(m_param, X265_LOG_WARNING, "detected ParallelFilter race condition on last row\n"); - -// avoid race on last row and last column -if (row >= 1) -{ -int prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get(); -while(prevCol != (int)numCols) -prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.waitForChange(prevCol); -} - -/* NOTE: Last Row not execute before, so didn't need wait */ -m_frameFilter.m_parallelFilter[row].waitForExit(); -m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols); -m_frameFilter.m_parallelFilter[row].processTasks(-1); - -/* Apply SAO on last row of CUs, because we always apply SAO on row[X-1] */ -if (m_param->bEnableSAO) -{ -FrameData* encData = m_frameFilter.m_parallelFilter[row].m_encData; -SAOParam* saoParam = encData->m_saoParam; -for(uint32_t col = 0; col < numCols; col++) -{ -// NOTE: must use processSaoUnitCu(), it include TQBypass logic - m_frameFilter.m_parallelFilter[row].processSaoUnitCu(saoParam, col); -} -} - -// Process border extension on last row -for(uint32_t col = 0; col < numCols; col++) -{ -// m_reconColCount will be set in processPostCu() -m_frameFilter.m_parallelFilter[row].processPostCu(col); -} -} -} - if (m_param->bEnableWavefront) { if (row >= m_filterRowDelay) diff -r 375ce77b8c35 -r fb58aa76a9ae source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 28 16:06:55 2015 -0600 +++ b/source/encoder/framefilter.cppThu Dec 31 12:54:33 2015 -0600 @@ -460,6 +460,42 @@ // SAO: was integrate into encode loop SAOParam* saoParam = encData.m_saoParam; +/* Processing left block Deblock with current threading */ +{ +/* stop threading on current row */ +m_parallelFilter[row].waitForExit(); + +/* Check to avoid previous row process slower than current row */ +X265_CHECK((row < 1) || m_parallelFilter[row - 1].m_lastDeblocked.get() == (int)ParallelFilter::numCols, "previous row not finish"); + 
+m_parallelFilter[row].m_allowedCol.set(ParallelFilter::numCols); +m_parallelFilter[row].processTasks(-1); +
[x265] [PATCH] fix non-determination output after apply new preset parameter sets
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1451594594 21600 # Node ID e5abd9b22f52fb8800068a2c699aea1648b69f54 # Parent fb58aa76a9ae1eb6b0bac37f34bc418ba2ce941c fix non-determination output after apply new preset parameter sets --- source/encoder/analysis.cpp | 16 1 files changed, 16 insertions(+), 0 deletions(-) diff -r fb58aa76a9ae -r e5abd9b22f52 source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Thu Dec 31 12:54:33 2015 -0600 +++ b/source/encoder/analysis.cpp Thu Dec 31 14:43:14 2015 -0600 @@ -836,6 +836,14 @@ splitData[2].initSplitCUData(); splitData[3].initSplitCUData(); +// avoid uninitialize value in below reference +if (m_param->limitModes) +{ +md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0 +md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1 +md.pred[PRED_2Nx2N].rdCost = 0; +} + /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */ if (mightNotSplit && depth >= minDepth) { @@ -1304,6 +1312,14 @@ bool foundSkip = false; bool splitIntra = true; +// avoid uninitialize value in below reference +if (m_param->limitModes) +{ +md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0 +md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1 +md.pred[PRED_2Nx2N].rdCost = 0; +} + if (m_param->analysisMode == X265_ANALYSIS_LOAD) { uint8_t* reuseDepth = _reuseInterDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: rewrite 16bpp partial pixels process code on upShift and downShift (Issue #223)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1451520182 21600 # Node ID 717cb31ed9931513bb0851f0e6c68af868b5ad45 # Parent 75d1c62d8f0c517dda37ac89f401faa308d60f24 asm: rewrite 16bpp partial pixels process code on upShift and downShift (Issue #223) --- source/common/x86/pixel-a.asm | 327 ++--- source/test/pixelharness.cpp | 25 +++- 2 files changed, 103 insertions(+), 249 deletions(-) diff -r 75d1c62d8f0c -r 717cb31ed993 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Thu Dec 24 13:58:32 2015 +0530 +++ b/source/common/x86/pixel-a.asm Wed Dec 30 18:03:02 2015 -0600 @@ -8154,92 +8154,57 @@ ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ; INIT_XMM sse2 -cglobal downShift_16, 7,7,3 -movdm0, r6d; m0 = shift +cglobal downShift_16, 4,7,3 +mov r4d, r4m +mov r5d, r5m +movdm0, r6m; m0 = shift add r1, r1 + dec r5d .loopH: xor r6, r6 + .loopW: movum1, [r0 + r6 * 2] -movum2, [r0 + r6 * 2 + 16] +movum2, [r0 + r6 * 2 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswbm1, m2 movu[r2 + r6], m1 -add r6, 16 +add r6, mmsize cmp r6d, r4d -jl .loopW +jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d -jnz .loopH - -;processing last row of every frame [To handle width which not a multiple of 16] - +jnz.loopH + +;processing last row of every frame [To handle width which not a multiple of 16] +; r4d must be more than or equal to 16(mmsize) .loop16: +movum1, [r0 + (r4 - mmsize) * 2] +movum2, [r0 + (r4 - mmsize) * 2 + mmsize] +psrlw m1, m0 +psrlw m2, m0 +packuswbm1, m2 +movu[r2 + r4 - mmsize], m1 + +sub r4d, mmsize +jz .end +cmp r4d, mmsize +jge.loop16 + +; process partial pixels movum1, [r0] -movum2, [r0 + 16] +movum2, [r0 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswbm1, m2 movu[r2], m1 -add r0, 2 * mmsize -add r2, mmsize -sub r4d, 16 -jz .end -cmp r4d, 15 -jg .loop16 - -cmp r4d, 8 -jl .process4 -movum1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movh[r2], m1 - -add r0, mmsize 
-add r2, 8 -sub r4d, 8 -jz .end - -.process4: -cmp r4d, 4 -jl .process2 -movhm1,[r0] -psrlw m1, m0 -packuswbm1, m1 -movd[r2], m1 - -add r0, 8 -add r2, 4 -sub r4d, 4 -jz .end - -.process2: -cmp r4d, 2 -jl .process1 -movdm1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movdr6, m1 -mov [r2], r6w - -add r0, 4 -add r2, 2 -sub r4d, 2 -jz .end - -.process1: -movdm1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movdr3, m1 -mov [r2], r3b .end: RET @@ -8248,12 +8213,16 @@ ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ;- INIT_YMM avx2 -cglobal downShift_16, 6,7,3 +cglobal downShift_16, 4,7,3 +mov r4d, r4m +mov r5d, r5m movdxm0, r6m; m0 = shift add r1d, r1d + dec r5d .loopH: xor r6, r6 + .loopW: movum1, [r0 + r6 * 2 + 0] movum2, [r0 + r6 * 2 + 32] @@ -8265,92 +8234,39 @@ add r6d, mmsize cmp r6d, r4d -jl .loopW +jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d -jnz .loopH - -; processing last row of every frame [To handle width which not a multiple of 32] -mov r6d, r4d -and r4d, 31 -shr r6d, 5 +jnz.loopH + +; processing last row of every frame [To handle width which not a multiple of 32] .loop32: -movum1, [r0] -movum2, [r0 + 32] +movum1, [r0 + (r4 - mmsize) * 2] +movum2, [r0 + (r4 - mmsize) * 2 + mmsize] psrlw m1, xm0
[x265] [PATCH] asm: rewrite 16bpp partial pixels process code on upShift and downShift
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1451520081 21600 # Node ID bee61d779c63523a5cb59919eff34a01c1d19a51 # Parent 75d1c62d8f0c517dda37ac89f401faa308d60f24 asm: rewrite 16bpp partial pixels process code on upShift and downShift --- source/common/x86/pixel-a.asm | 327 ++--- source/test/pixelharness.cpp | 25 +++- 2 files changed, 103 insertions(+), 249 deletions(-) diff -r 75d1c62d8f0c -r bee61d779c63 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Thu Dec 24 13:58:32 2015 +0530 +++ b/source/common/x86/pixel-a.asm Wed Dec 30 18:01:21 2015 -0600 @@ -8154,92 +8154,57 @@ ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ; INIT_XMM sse2 -cglobal downShift_16, 7,7,3 -movdm0, r6d; m0 = shift +cglobal downShift_16, 4,7,3 +mov r4d, r4m +mov r5d, r5m +movdm0, r6m; m0 = shift add r1, r1 + dec r5d .loopH: xor r6, r6 + .loopW: movum1, [r0 + r6 * 2] -movum2, [r0 + r6 * 2 + 16] +movum2, [r0 + r6 * 2 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswbm1, m2 movu[r2 + r6], m1 -add r6, 16 +add r6, mmsize cmp r6d, r4d -jl .loopW +jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d -jnz .loopH - -;processing last row of every frame [To handle width which not a multiple of 16] - +jnz.loopH + +;processing last row of every frame [To handle width which not a multiple of 16] +; r4d must be more than or equal to 16(mmsize) .loop16: +movum1, [r0 + (r4 - mmsize) * 2] +movum2, [r0 + (r4 - mmsize) * 2 + mmsize] +psrlw m1, m0 +psrlw m2, m0 +packuswbm1, m2 +movu[r2 + r4 - mmsize], m1 + +sub r4d, mmsize +jz .end +cmp r4d, mmsize +jge.loop16 + +; process partial pixels movum1, [r0] -movum2, [r0 + 16] +movum2, [r0 + mmsize] psrlw m1, m0 psrlw m2, m0 packuswbm1, m2 movu[r2], m1 -add r0, 2 * mmsize -add r2, mmsize -sub r4d, 16 -jz .end -cmp r4d, 15 -jg .loop16 - -cmp r4d, 8 -jl .process4 -movum1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movh[r2], m1 - -add r0, mmsize -add r2, 8 
-sub r4d, 8 -jz .end - -.process4: -cmp r4d, 4 -jl .process2 -movhm1,[r0] -psrlw m1, m0 -packuswbm1, m1 -movd[r2], m1 - -add r0, 8 -add r2, 4 -sub r4d, 4 -jz .end - -.process2: -cmp r4d, 2 -jl .process1 -movdm1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movdr6, m1 -mov [r2], r6w - -add r0, 4 -add r2, 2 -sub r4d, 2 -jz .end - -.process1: -movdm1, [r0] -psrlw m1, m0 -packuswbm1, m1 -movdr3, m1 -mov [r2], r3b .end: RET @@ -8248,12 +8213,16 @@ ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ;- INIT_YMM avx2 -cglobal downShift_16, 6,7,3 +cglobal downShift_16, 4,7,3 +mov r4d, r4m +mov r5d, r5m movdxm0, r6m; m0 = shift add r1d, r1d + dec r5d .loopH: xor r6, r6 + .loopW: movum1, [r0 + r6 * 2 + 0] movum2, [r0 + r6 * 2 + 32] @@ -8265,92 +8234,39 @@ add r6d, mmsize cmp r6d, r4d -jl .loopW +jl .loopW ; move to next row add r0, r1 add r2, r3 dec r5d -jnz .loopH - -; processing last row of every frame [To handle width which not a multiple of 32] -mov r6d, r4d -and r4d, 31 -shr r6d, 5 +jnz.loopH + +; processing last row of every frame [To handle width which not a multiple of 32] .loop32: -movum1, [r0] -movum2, [r0 + 32] +movum1, [r0 + (r4 - mmsize) * 2] +movum2, [r0 + (r4 - mmsize) * 2 + mmsize] psrlw m1, xm0 psrlw m
[x265] [PATCH 1 of 2] fix weight memory free bug on I400
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1451340412 21600 # Node ID f8daf6c38fec27b8b01f3a01df19c5a2252382b9 # Parent 1471e4e433f71f39a6eb93507c349fb8539e fix weight memory free bug on I400 --- source/encoder/reference.cpp |6 +++--- 1 files changed, 3 insertions(+), 3 deletions(-) diff -r 1471e4e433cc -r f8daf6c38fec source/encoder/reference.cpp --- a/source/encoder/reference.cpp Tue Dec 22 19:25:58 2015 +0530 +++ b/source/encoder/reference.cpp Mon Dec 28 16:06:52 2015 -0600 @@ -40,9 +40,9 @@ MotionReference::~MotionReference() { -X265_FREE(weightBuffer[0]); -X265_FREE(weightBuffer[1]); -X265_FREE(weightBuffer[2]); +if (weightBuffer[0]) X265_FREE_ZERO(weightBuffer[0]); +if (weightBuffer[1]) X265_FREE_ZERO(weightBuffer[1]); +if (weightBuffer[2]) X265_FREE_ZERO(weightBuffer[2]); } int MotionReference::init(PicYuv* recPic, WeightParam *wp, const x265_param& p) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 3] do border extension on CU level and new counter for reconColCount
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1450727186 21600 # Node ID bcc6d005cd852c043413d1f90aca05366acec514 # Parent 942587f1ab4484ce69a818ce9c8adc59c38fe239 do border extension on CU level and new counter for reconColCount --- source/common/frame.cpp | 11 +++ source/common/frame.h |2 + source/encoder/dpb.cpp |6 ++ source/encoder/frameencoder.cpp | 13 +++ source/encoder/framefilter.cpp | 186 +++ source/encoder/framefilter.h| 17 6 files changed, 177 insertions(+), 58 deletions(-) diff -r 942587f1ab44 -r bcc6d005cd85 source/common/frame.cpp --- a/source/common/frame.cpp Wed Dec 16 09:08:00 2015 +0530 +++ b/source/common/frame.cpp Mon Dec 21 13:46:26 2015 -0600 @@ -33,6 +33,7 @@ m_bChromaExtended = false; m_lowresInit = false; m_reconRowCount.set(0); +m_reconColCount = NULL; m_countRefEncoders = 0; m_encData = NULL; m_reconPic = NULL; @@ -51,6 +52,10 @@ if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) && m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode)) { +X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized"); +m_numRows = (m_fencPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize; +m_reconColCount = new ThreadSafeInteger[m_numRows]; + if (quantOffsets) { int32_t cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol; @@ -122,6 +127,12 @@ m_reconPic = NULL; } +if (m_reconColCount) +{ +delete[] m_reconColCount; +m_reconColCount = NULL; +} + if (m_quantOffsets) { delete[] m_quantOffsets; diff -r 942587f1ab44 -r bcc6d005cd85 source/common/frame.h --- a/source/common/frame.h Wed Dec 16 09:08:00 2015 +0530 +++ b/source/common/frame.h Mon Dec 21 13:46:26 2015 -0600 @@ -63,6 +63,8 @@ /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */ ThreadSafeInteger m_reconRowCount; // count of CTU rows completely reconstructed and extended for motion reference +ThreadSafeInteger* m_reconColCount; // count of CTU cols completely reconstructed and 
extended for motion reference +int32_tm_numRows; volatile uint32_t m_countRefEncoders; // count of FrameEncoder threads monitoring m_reconRowCount Frame* m_next; // PicList doubly linked list pointers diff -r 942587f1ab44 -r bcc6d005cd85 source/encoder/dpb.cpp --- a/source/encoder/dpb.cppWed Dec 16 09:08:00 2015 +0530 +++ b/source/encoder/dpb.cppMon Dec 21 13:46:26 2015 -0600 @@ -74,6 +74,12 @@ curFrame->m_reconRowCount.set(0); curFrame->m_bChromaExtended = false; +// Reset column counter +X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure"); +X265_CHECK(curFrame->m_numRows > 0, "curFrame->m_numRows check failure"); +for(int32_t col = 0; col < curFrame->m_numRows; col++) +curFrame->m_reconColCount[col].set(0); + // iterator is invalidated by remove, restart scan m_picList.remove(*curFrame); iterFrame = m_picList.first(); diff -r 942587f1ab44 -r bcc6d005cd85 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Wed Dec 16 09:08:00 2015 +0530 +++ b/source/encoder/frameencoder.cpp Mon Dec 21 13:46:26 2015 -0600 @@ -1139,6 +1139,12 @@ m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1); } } +// Both Loopfilter and SAO Disabled +else +{ +m_frameFilter.m_parallelFilter[row].processPostCu(col); +m_frame->m_reconColCount[row].set(col); +} if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow)) @@ -1247,6 +1253,13 @@ m_frameFilter.m_parallelFilter[row].processSaoUnitCu(saoParam, col); } } + +// Process border extension on last row +for(uint32_t col = 0; col < numCols; col++) +{ +m_frameFilter.m_parallelFilter[row].processPostCu(col); +} +m_frame->m_reconColCount[row].set(numCols - 1); } } diff -r 942587f1ab44 -r bcc6d005cd85 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppWed Dec 16 09:08:00 2015 +0530 +++ b/source/encoder/framefilter.cppMon Dec 21 13:46:26 2015 -0600 @@ -37,6 +37,8 @@ uint32_t 
FrameFilter::ParallelFilter::numCols = 0; uint32_t FrameFilter::Paral
[x265] [PATCH 3 of 3] reduce pointer operators on I400
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1450728715 21600 # Node ID 3e9e45fd692d79806e53d69c2ed7a20b3e24e671 # Parent d8c3eded1440670bde63e2fb5bec0e80ff6e2d67 reduce pointer operators on I400 --- source/encoder/framefilter.cpp |7 --- 1 files changed, 4 insertions(+), 3 deletions(-) diff -r d8c3eded1440 -r 3e9e45fd692d source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 21 13:46:28 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 21 14:11:55 2015 -0600 @@ -507,7 +507,7 @@ const intptr_t stride = reconPic->m_stride; const intptr_t strideC = reconPic->m_strideC; pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr); -// // MUST BE check I400 since m_picOrg uninitialize in that case +// MUST BE check I400 since m_picOrg uninitialize in that case pixel *pixU = (m_param->internalCsp != X265_CSP_I400) ? reconPic->getCbAddr(lineStartCUAddr) : NULL; pixel *pixV = (m_param->internalCsp != X265_CSP_I400) ? reconPic->getCrAddr(lineStartCUAddr) : NULL; int copySizeY = realW; @@ -560,13 +560,14 @@ if (row == FrameFilter::ParallelFilter::numRows - 1) { pixY += (realH - 1) * stride; -pixU += ((realH >> vChromaShift) - 1) * strideC; -pixV += ((realH >> vChromaShift) - 1) * strideC; for (uint32_t y = 0; y < lumaMarginY; y++) memcpy(pixY + (y + 1) * stride, pixY, copySizeY * sizeof(pixel)); if (m_param->internalCsp != X265_CSP_I400) { +pixU += ((realH >> vChromaShift) - 1) * strideC; +pixV += ((realH >> vChromaShift) - 1) * strideC; + for (uint32_t y = 0; y < chromaMarginY; y++) { memcpy(pixU + (y + 1) * strideC, pixU, copySizeC * sizeof(pixel)); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 3] simplify logic on setting reconColCount[] and fix bug in case that disable both Deblock and Sao
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1450727188 21600 # Node ID d8c3eded1440670bde63e2fb5bec0e80ff6e2d67 # Parent bcc6d005cd852c043413d1f90aca05366acec514 simplify logic on setting reconColCount[] and fix bug in case that disable both Deblock and Sao --- source/encoder/frameencoder.cpp |5 +- source/encoder/framefilter.cpp | 119 +-- source/encoder/framefilter.h|8 +- 3 files changed, 108 insertions(+), 24 deletions(-) diff -r bcc6d005cd85 -r d8c3eded1440 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 21 13:46:26 2015 -0600 +++ b/source/encoder/frameencoder.cpp Mon Dec 21 13:46:28 2015 -0600 @@ -1142,8 +1142,7 @@ // Both Loopfilter and SAO Disabled else { -m_frameFilter.m_parallelFilter[row].processPostCu(col); -m_frame->m_reconColCount[row].set(col); +m_frameFilter.processPostCu(row, col); } if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && @@ -1257,9 +1256,9 @@ // Process border extension on last row for(uint32_t col = 0; col < numCols; col++) { +// m_reconColCount will be set in processPostCu() m_frameFilter.m_parallelFilter[row].processPostCu(col); } -m_frame->m_reconColCount[row].set(numCols - 1); } } diff -r bcc6d005cd85 -r d8c3eded1440 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 21 13:46:26 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 21 13:46:28 2015 -0600 @@ -231,6 +231,9 @@ // NOTE: MUST BE delay a row when Deblock enabled, the Deblock will modify above pixels in Horizon pass void FrameFilter::ParallelFilter::processPostCu(uint32_t col) const { +// Update finished CU cursor +m_frame->m_reconColCount[m_row].set(col); + // shortcut path for non-border area if ((col != 0) & (col != numCols - 1) & (m_row != 0) & (m_row != numRows - 1)) return; @@ -352,10 +355,7 @@ // When SAO Disable, setting column counter here if ((!m_param->bEnableSAO) & (m_row >= 1)) -{ m_prevRow->processPostCu(col - 1); -m_frame->m_reconColCount[m_row - 1].set(col 
- 1); -} } if (m_param->bEnableSAO) @@ -378,7 +378,6 @@ // Must delay 1 row to avoid thread data race conflict m_prevRow->processSaoUnitCu(saoParam, col - 3); m_prevRow->processPostCu(col - 3); -m_frame->m_reconColCount[m_row - 1].set(col - 3); } } @@ -398,10 +397,7 @@ // When SAO Disable, setting column counter here if ((!m_param->bEnableSAO) & (m_row >= 1)) -{ m_prevRow->processPostCu(numCols - 1); -m_frame->m_reconColCount[m_row - 1].set(numCols - 1); -} } // TODO: move processPostCu() into processSaoUnitCu() @@ -456,7 +452,7 @@ if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO) { -processRowPost(row); +processPostRow(row); return; } FrameData& encData = *m_frame->m_encData; @@ -467,7 +463,7 @@ // this row of CTUs has been encoded if (row > 0) -processRowPost(row - 1); +processPostRow(row - 1); if (row == m_numRows - 1) { @@ -482,16 +478,105 @@ m_parallelFilter[0].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame); } -processRowPost(row); +processPostRow(row); } } -uint32_t FrameFilter::getCUHeight(int rowNum) const +// NOTE: This version for case that Disable both Deblock and Sao +void FrameFilter::processPostCu(uint32_t row, uint32_t col) const { -return rowNum == m_numRows - 1 ? m_lastHeight : g_maxCUSize; +// Update finished CU cursor +m_frame->m_reconColCount[row].set(col); + +// shortcut path for non-border area +if ((col != 0) & (col != FrameFilter::ParallelFilter::numCols - 1) & (row != 0) & (row != FrameFilter::ParallelFilter::numRows - 1)) +return; + +PicYuv *reconPic = m_frame->m_reconPic; +const uint32_t rowAddr = row * FrameFilter::ParallelFilter::numCols; +const uint32_t lineStartCUAddr = rowAddr + col; +const int realH = FrameFilter::ParallelFilter::getCUHeight(row); +const int realW = FrameFilter::ParallelFilter::getCUWidth(col); + +const uint32_t lumaMarginX = reconPic->m_lumaMarginX; +const uint32_t lumaMarginY = reconPic->m_lumaMarginY; +const uint32_t chromaMarginX = reconPic->m_chromaMa
[x265] [PATCH 1 of 2] asm: reduce saoCuStatsBO code size by remove offset field
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449793234 21600 # Node ID 6135ca57edd80ce619a39c542823e6cd09533b1b # Parent ec3f657507db94e88ab45496bd260c3ec1e917a0 asm: reduce saoCuStatsBO code size by remove offset field --- source/common/x86/loopfilter.asm | 11 +++ 1 files changed, 7 insertions(+), 4 deletions(-) diff -r ec3f657507db -r 6135ca57edd8 source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Thu Dec 10 18:20:31 2015 -0600 +++ b/source/common/x86/loopfilter.asm Thu Dec 10 18:20:34 2015 -0600 @@ -2000,11 +2000,14 @@ cglobal saoCuStatsBO, 7,12,2 movam0, [pb_124] xor r7d, r7d +add r5, 4 +add r6, 4 .loopH: mov r10, r0 mov r11, r1 mov r9d, r3d + .loopL: movum1, [r11] psrlw m1, 1 ; rec[x] >> boShift @@ -2014,16 +2017,16 @@ %rep 16 pextrb r7d, m1, x movsx r8d, word [r10 + x*2] ; diff[x] -inc dword [r6 + r7 + 4]; count[classIdx]++ -add [r5 + r7 + 4], r8d ; stats[classIdx] += (fenc[x] - rec[x]); +inc dword [r6 + r7]; count[classIdx]++ +add [r5 + r7], r8d ; stats[classIdx] += (fenc[x] - rec[x]); dec r9d -jz .next +jz .next %assign x x+1 %endrep add r10, 16*2 add r11, 16 -jmp .loopL +jmp.loopL .next: add r0, 64*2; MAX_CU_SIZE ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 2 of 2] asm: improve saoCuStatsBO by split loop path and replace PEXTRB
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449793237 21600 # Node ID cf0ac10f6dffecc9c9096163f570365c1b0a4ffa # Parent 6135ca57edd80ce619a39c542823e6cd09533b1b asm: improve saoCuStatsBO by split loop path and replace PEXTRB --- source/common/x86/loopfilter.asm | 35 --- 1 files changed, 28 insertions(+), 7 deletions(-) diff -r 6135ca57edd8 -r cf0ac10f6dff source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Thu Dec 10 18:20:34 2015 -0600 +++ b/source/common/x86/loopfilter.asm Thu Dec 10 18:20:37 2015 -0600 @@ -1997,14 +1997,13 @@ ;-- %if ARCH_X86_64 INIT_XMM sse4 -cglobal saoCuStatsBO, 7,12,2 +cglobal saoCuStatsBO, 7,13,2 movam0, [pb_124] -xor r7d, r7d add r5, 4 add r6, 4 .loopH: -mov r10, r0 +mov r12, r0 mov r11, r1 mov r9d, r3d @@ -2013,10 +2012,32 @@ psrlw m1, 1 ; rec[x] >> boShift pandm1, m0 +cmp r9d, 8 +jle.proc8 + +movqr10, m1 %assign x 0 -%rep 16 -pextrb r7d, m1, x -movsx r8d, word [r10 + x*2] ; diff[x] +%rep 8 +movzx r7d, r10b +shr r10, 8 + +movsx r8d, word [r12 + x*2] ; diff[x] +inc dword [r6 + r7]; count[classIdx]++ +add [r5 + r7], r8d ; stats[classIdx] += (fenc[x] - rec[x]); +%assign x x+1 +%endrep +movhlps m1, m1 +sub r9d, 8 +add r12, 8*2 + +.proc8: +movqr10, m1 +%assign x 0 +%rep 8 +movzx r7d, r10b +shr r10, 8 + +movsx r8d, word [r12 + x*2] ; diff[x] inc dword [r6 + r7]; count[classIdx]++ add [r5 + r7], r8d ; stats[classIdx] += (fenc[x] - rec[x]); dec r9d @@ -2024,7 +2045,7 @@ %assign x x+1 %endrep -add r10, 16*2 +add r12, 8*2 add r11, 16 jmp.loopL ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 4 of 5] asm: AVX2 version of saoCuStatsE2, (138180c -> 44906c)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449698985 21600 # Node ID a5f81208a7ba8043261c009582995c48a1c40f37 # Parent 7ad2050bc2aaa8083b4e2de14d5846e5074b7b73 asm: AVX2 version of saoCuStatsE2, (138180c -> 44906c) --- source/common/x86/asm-primitives.cpp |1 + source/common/x86/loopfilter.asm | 707 ++ 2 files changed, 453 insertions(+), 255 deletions(-) diff -r 7ad2050bc2aa -r a5f81208a7ba source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:43 2015 -0600 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:45 2015 -0600 @@ -3638,6 +3638,7 @@ p.propagateCost = PFX(mbtree_propagate_cost_avx2); p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2); p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2); +p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2); if (cpuMask & X265_CPU_BMI2) { diff -r 7ad2050bc2aa -r a5f81208a7ba source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Wed Dec 09 16:09:43 2015 -0600 +++ b/source/common/x86/loopfilter.asm Wed Dec 09 16:09:45 2015 -0600 @@ -2467,7 +2467,7 @@ ; m[1-4] free in here -; get current process mask +; get current process group mask mov r7d, 16 mov r8d, r6d cmp r6d, r7d @@ -2592,248 +2592,6 @@ RET %endif ; ARCH_X86_64 -%if ARCH_X86_64 -;; argument registers used - -; r0- src -; r1- srcStep -; r2- offset -; r3- tcP -; r4- tcQ - -INIT_XMM sse4 -cglobal pelFilterLumaStrong_H, 5,7,10 -mov r1, r2 -neg r3d -neg r4d -neg r1 - -lea r5, [r2 * 3] -lea r6, [r1 * 3] - -pmovzxbwm4, [r0]; src[0] -pmovzxbwm3, [r0 + r1] ; src[-offset] -pmovzxbwm2, [r0 + r1 * 2] ; src[-offset * 2] -pmovzxbwm1, [r0 + r6] ; src[-offset * 3] -pmovzxbwm0, [r0 + r1 * 4] ; src[-offset * 4] -pmovzxbwm5, [r0 + r2] ; src[offset] -pmovzxbwm6, [r0 + r2 * 2] ; src[offset * 2] -pmovzxbwm7, [r0 + r5] ; src[offset * 3] - -paddw m0, m0 ; m0*2 -movam8, m2 -paddw m8, m3 ; m2 + m3 -paddw m8, m4 ; m2 + m3 + m4 -movam9, m8 -paddw m9, m9 ; 2*m2 + 2*m3 + 2*m4 -paddw m8, m1 ; m2 + m3 + m4 + m1 -paddw m0, m8 ; 2*m0 + m2+ m3 + m4 + 
m1 -paddw m9, m1 -paddw m0, m1 -paddw m9, m5 ; m1 + 2*m2 + 2*m3 + 2*m4 + m5 -paddw m0, m1 ; 2*m0 + 3*m1 + m2 + m3 + m4 - -punpcklqdq m0, m9 -punpcklqdq m1, m3 - -paddw m3, m4 -movam9, m5 -paddw m9, m6 -paddw m7, m7 ; 2*m7 -paddw m9, m3 ; m3 + m4 + m5 + m6 -movam3, m9 -paddw m3, m3 ; 2*m3 + 2*m4 + 2*m5 + 2*m6 -paddw m7, m9 ; 2*m7 + m3 + m4 + m5 + m6 -paddw m7, m6 -psubw m3, m6 ; 2*m3 + 2*m4 + 2*m5 + m6 -paddw m7, m6 ; m3 + m4 + m5 + 3*m6 + 2*m7 -paddw m3, m2 ; m2 + 2*m3 + 2*m4 + 2*m5 + m6 - -punpcklqdq m9, m8 -punpcklqdq m3, m7 -punpcklqdq m5, m2 -punpcklqdq m4, m6 - -movdm7, r3d ; -tcP -movdm2, r4d ; -tcQ -pshufb m7, [pb_01] -pshufb m2, [pb_01] -movam6, m2 -punpcklqdq m6, m7 - -paddw m0, [pw_4] -paddw m3, [pw_4] -paddw m9, [pw_2] - -psraw m0, 3 -psraw m3, 3 -psraw m9, 2 - -psubw m0, m1 -psubw m3, m4 -psubw m9, m5 - -pmaxsw m0, m7 -pmaxsw m3, m2 -pmaxsw m9, m6 -psignw m7, [pw_n1] -psignw m2, [pw_n1] -psignw m6, [pw_n1] -pminsw m0, m7 -pminsw m3, m2 -pminsw m9, m6 - -paddw m0, m1 -paddw m3, m4 -paddw m9, m5 -packuswbm0, m0 -packuswbm3, m9 - -movd[r0 + r6], m0 -pextrd [r0 + r1], m0, 1 -movd[r0], m3 -pextrd [r0 + r2 * 2], m3, 1 -pextrd [r0 + r2 * 1], m3, 2 -pextrd [r0 + r1 * 2], m3, 3 -RET - -INIT_XMM sse4 -cglobal pelFilterLumaStrong_V, 5,5,10 -neg r3d -neg r4d -lea r2, [r1 * 3] - -movh
[x265] [PATCH 3 of 5] asm: AVX2 version of saoCuStatsE1, (131370c -> 41189c)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449698983 21600 # Node ID 7ad2050bc2aaa8083b4e2de14d5846e5074b7b73 # Parent 2073ed3429fe81af14b46aca6a14e0b34405f615 asm: AVX2 version of saoCuStatsE1, (131370c -> 41189c) --- source/common/x86/asm-primitives.cpp |1 + source/common/x86/loopfilter.asm | 184 +- source/encoder/sao.cpp |2 + 3 files changed, 184 insertions(+), 3 deletions(-) diff -r 2073ed3429fe -r 7ad2050bc2aa source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:40 2015 -0600 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:43 2015 -0600 @@ -3637,6 +3637,7 @@ p.frameInitLowres = PFX(frame_init_lowres_core_avx2); p.propagateCost = PFX(mbtree_propagate_cost_avx2); p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2); +p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2); if (cpuMask & X265_CPU_BMI2) { diff -r 2073ed3429fe -r 7ad2050bc2aa source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Wed Dec 09 16:09:40 2015 -0600 +++ b/source/common/x86/loopfilter.asm Wed Dec 09 16:09:43 2015 -0600 @@ -2238,7 +2238,7 @@ pmaddwd m4, m5, m2 paddd m15, m4 -sub r5d, 16 +sub r5d, r7d jle.next add r0, 16*2 @@ -2299,7 +2299,7 @@ phaddd xm3, xm1 phaddd xm2, xm4 phaddd xm3, xm2 -psubd xm3, xm0, xm3 ; negtive to compensate PMADDWD sign algorithm problem +psubd xm3, xm0, xm3 ; negtive for compensate PMADDWD sign algorithm problem ; sum stats[4] only HADDD xm5, xm6 @@ -2321,7 +2321,6 @@ cglobal saoCuStatsE1, 4,12,8,0-32; Stack: 5 of stats and 5 of count mov r5d, r5m mov r4d, r4m -mov r11d, r5d ; clear internal temporary buffer pxorm0, m0 @@ -2412,6 +2411,185 @@ mov r6d, [rsp + 5 * 2 + 4 * 4] add [r1 + 4 * 4], r6d RET + + +INIT_YMM avx2 +cglobal saoCuStatsE1, 4,13,16 ; Stack: 5 of stats and 5 of count +mov r5d, r5m +mov r4d, r4m + +; clear internal temporary buffer +pxorxm6, xm6; count[0] +pxorxm7, xm7; count[1] +pxorxm8, xm8; count[2] +pxorxm9, xm9; count[3] +pxorxm10, xm10 ; count[4] +pxorxm11, xm11 ; stats[0] 
+pxorxm12, xm12 ; stats[1] +pxorxm13, xm13 ; stats[2] +pxorxm14, xm14 ; stats[3] +pxorxm15, xm15 ; stats[4] +movam0, [pb_128] +movam5, [pb_1] + +; save unavailable bound pixel +push qword [r3 + r4] + +; unavailable mask +lea r12, [pb_movemask_32 + 32] + +.loopH: +mov r6d, r4d +mov r9, r0 +mov r10, r1 +mov r11, r3 + +.loopW: +movuxm1, [r10] +movuxm2, [r10 + r2] + +; signDown +pxorxm1, xm0 +pxorxm2, xm0 +pcmpgtb xm3, xm1, xm2 +pcmpgtb xm2, xm1 +pandxm3, xm5 +por xm2, xm3 +psignb xm3, xm2, xm0 ; -signDown + +; edgeType +movuxm4, [r11] +paddb xm4, [pb_2] +paddb xm2, xm4 + +; update upBuff1 (must be delay, above code modify memory[r11]) +movu[r11], xm3 + +; m[1-4] free in here + +; get current process mask +mov r7d, 16 +mov r8d, r6d +cmp r6d, r7d +cmovge r8d, r7d +neg r8 +movuxm1, [r12 + r8] + +; tmp_count[edgeType]++ +; tmp_stats[edgeType] += (fenc[x] - rec[x]) +pxorxm3, xm3 +por xm1, xm2; apply unavailable pixel mask +movum4, [r9]; up to 14bits + +pcmpeqb xm3, xm1, xm3 +psubb xm6, xm3 +pmovsxbwm2, xm3 +pmaddwd m3, m4, m2 +paddd m11, m3 + +pcmpeqb xm3, xm1, xm5 +psubb xm7, xm3 +pmovsxbwm2, xm3 +pmaddwd m3, m4, m2 +paddd m12, m3 + +pcmpeqb xm3, xm1, [pb_2] +psubb xm8, xm3 +pmovsxbwm2, xm3 +pmaddwd m3, m4, m2 +paddd m13, m3 + +pcmpeqb xm3, xm1, [pb_3] +psubb xm9, xm3 +pmovsxbwm2, xm3 +pmaddwd m3, m4, m2 +paddd m14, m3 + +pcmpeqb xm3, xm1, [pb_4] +psubb xm10, xm3 +pmovsxbwm2, xm3 +pmaddwd m3, m4, m2 +paddd m15, m3 + +sub r6d, r7d +jle.next + +add r9,
[x265] [PATCH 2 of 5] asm: AVX2 version of saoCuStatsE0, (133572c -> 47575c)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449698980 21600 # Node ID 2073ed3429fe81af14b46aca6a14e0b34405f615 # Parent 6e39e10b195e56c54c27050c727521c39ef29125 asm: AVX2 version of saoCuStatsE0, (133572c -> 47575c) --- source/common/x86/asm-primitives.cpp |1 + source/common/x86/const-a.asm|4 +- source/common/x86/loopfilter.asm | 194 -- 3 files changed, 190 insertions(+), 9 deletions(-) diff -r 6e39e10b195e -r 2073ed3429fe source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Dec 09 15:24:37 2015 -0600 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:40 2015 -0600 @@ -3636,6 +3636,7 @@ p.frameInitLowres = PFX(frame_init_lowres_core_avx2); p.propagateCost = PFX(mbtree_propagate_cost_avx2); +p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2); if (cpuMask & X265_CPU_BMI2) { diff -r 6e39e10b195e -r 2073ed3429fe source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Wed Dec 09 15:24:37 2015 -0600 +++ b/source/common/x86/const-a.asm Wed Dec 09 16:09:40 2015 -0600 @@ -32,10 +32,10 @@ ;; 8-bit constants -const pb_0, times 16 db 0 +const pb_0, times 32 db 0 const pb_1, times 32 db 1 const pb_2, times 32 db 2 -const pb_3, times 16 db 3 +const pb_3, times 32 db 3 const pb_4, times 32 db 4 const pb_8, times 32 db 8 const pb_15,times 32 db 15 diff -r 6e39e10b195e -r 2073ed3429fe source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Wed Dec 09 15:24:37 2015 -0600 +++ b/source/common/x86/loopfilter.asm Wed Dec 09 16:09:40 2015 -0600 @@ -32,22 +32,22 @@ pb_31: times 32 db 31 pb_124: times 32 db 124 pb_15: times 32 db 15 -pb_movemask_32: times 32 db 0x00 - times 32 db 0xFF SECTION .text cextern pb_1 +cextern pb_2 +cextern pb_3 +cextern pb_4 cextern pb_01 cextern pb_128 -cextern pb_2 +cextern pw_1 +cextern pw_n1 cextern pw_2 +cextern pw_4 cextern pw_pixel_max cextern pb_movemask -cextern pw_1 +cextern pb_movemask_32 cextern hmul_16p -cextern pb_4 -cextern pw_4 -cextern pw_n1 ; @@ -2131,6 +2131,186 @@ mov 
r6d, [rsp + 5 * 2 + 4 * 4] add [r9 + 4 * 4], r6d RET + + +;--- +; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count) +;--- +INIT_YMM avx2 +; spending rbp register to avoid x86inc stack alignment problem +cglobal saoCuStatsE0, 3,11,16 +mov r3d, r3m +mov r4d, r4m +mov r9, r5mp + +; clear internal temporary buffer +pxorxm6, xm6; count[0] +pxorxm7, xm7; count[1] +pxorxm8, xm8; count[2] +pxorxm9, xm9; count[3] +pxorxm10, xm10 ; count[4] +pxorxm11, xm11 ; stats[0] +pxorxm12, xm12 ; stats[1] +pxorxm13, xm13 ; stats[2] +pxorxm14, xm14 ; stats[3] +pxorxm15, xm15 ; stats[4] +xor r7d, r7d + +; correct stride for diff[] and rec +mov r6d, r3d +and r6d, ~15 +sub r2, r6 +lea r8, [(r6 - 64) * 2] ; 64 = MAX_CU_SIZE +lea r10, [pb_movemask_32 + 32] + +.loopH: +mov r5d, r3d + +; calculate signLeft +mov r7b, [r1] +sub r7b, [r1 - 1] +setar7b +setbr6b +sub r7b, r6b +neg r7b +pinsrb xm0, r7d, 15 + +.loopL: +movam4, [pb_128]; lower performance, but we haven't enough register for stats[] +movuxm3, [r1] +movuxm2, [r1 + 1] + +pxorxm1, xm3, xm4 +pxorxm2, xm4 +pcmpgtb xm3, xm1, xm2 +pcmpgtb xm2, xm1 +pandxm3, [pb_1] +por xm2, xm3; signRight + +palignr xm3, xm2, xm0, 15 +psignb xm3, xm4; signLeft + +movaxm0, xm2 +paddb xm2, xm3 +paddb xm2, [pb_2] ; edgeType + +; get current process mask +mov r7d, 16 +mov r6d, r5d +cmp r5d, r7d +cmovge r6d, r7d +neg
[x265] [PATCH 5 of 5] asm: AVX2 version of saoCuStatsE3, (136881c -> 45126c)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449698989 21600 # Node ID 32eb64163b1d3c5d7dceb6bfedb84b61e160094e # Parent a5f81208a7ba8043261c009582995c48a1c40f37 asm: AVX2 version of saoCuStatsE3, (136881c -> 45126c) --- source/common/x86/asm-primitives.cpp |1 + source/common/x86/loopfilter.asm | 186 ++ 2 files changed, 187 insertions(+), 0 deletions(-) diff -r a5f81208a7ba -r 32eb64163b1d source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:45 2015 -0600 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 09 16:09:49 2015 -0600 @@ -3639,6 +3639,7 @@ p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2); p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2); p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2); +p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2); if (cpuMask & X265_CPU_BMI2) { diff -r a5f81208a7ba -r 32eb64163b1d source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Wed Dec 09 16:09:45 2015 -0600 +++ b/source/common/x86/loopfilter.asm Wed Dec 09 16:09:49 2015 -0600 @@ -3074,6 +3074,192 @@ mov r6d, [rsp + 5 * 2 + 4 * 4] add [r1 + 4 * 4], r6d RET + + +INIT_YMM avx2 +cglobal saoCuStatsE3, 4,10,16 ; Stack: 5 of stats and 5 of count +mov r4d, r4m +mov r5d, r5m + +; clear internal temporary buffer +pxorxm6, xm6; count[0] +pxorxm7, xm7; count[1] +pxorxm8, xm8; count[2] +pxorxm9, xm9; count[3] +pxorxm10, xm10 ; count[4] +pxorxm11, xm11 ; stats[0] +pxorxm12, xm12 ; stats[1] +pxorxm13, xm13 ; stats[2] +pxorxm14, xm14 ; stats[3] +pxorxm15, xm15 ; stats[4] +movam0, [pb_128] + +; unavailable mask +lea r9, [pb_movemask_32 + 32] +push qword [r3 + r4] + +.loopH: +mov r6d, r4d + +.loopW: +movum1, [r1] +movum2, [r1 + r2 - 1] + +; signDown +; stats[edgeType] +pxorxm1, xm0 +pxorxm2, xm0 +pcmpgtb xm3, xm1, xm2 +pandxm3, [pb_1] +pcmpgtb xm2, xm1 +por xm2, xm3 +pxorxm3, xm3 +psubb xm3, xm2 + +; edgeType +movuxm4, [r3] +paddb xm4, [pb_2] +paddb xm2, xm4 + +; update upBuff1 +movu[r3 - 1], xm3 + +; m[1-4] free in here + +; get current process 
group mask +mov r7d, 16 +mov r8d, r6d +cmp r6d, r7d +cmovge r8d, r7d +neg r8 +movuxm1, [r9 + r8] + +; tmp_count[edgeType]++ +; tmp_stats[edgeType] += (fenc[x] - rec[x]) +pxorxm3, xm3 +por xm1, xm2; apply unavailable pixel mask +movum4, [r0]; up to 14bits + +pcmpeqb xm3, xm1, xm3 +psubb xm6, xm3 +pmovsxbwm2, xm3 +pmaddwd m3, m4, m2 +paddd m11, m3 + +pcmpeqb xm3, xm1, [pb_1] +psubb xm7, xm3 +pmovsxbwm2, xm3 +pmaddwd m3, m4, m2 +paddd m12, m3 + +pcmpeqb xm3, xm1, [pb_2] +psubb xm8, xm3 +pmovsxbwm2, xm3 +pmaddwd m3, m4, m2 +paddd m13, m3 + +pcmpeqb xm3, xm1, [pb_3] +psubb xm9, xm3 +pmovsxbwm2, xm3 +pmaddwd m3, m4, m2 +paddd m14, m3 + +pcmpeqb xm3, xm1, [pb_4] +psubb xm10, xm3 +pmovsxbwm2, xm3 +pmaddwd m3, m4, m2 +paddd m15, m3 + +sub r6d, r7d +jle.next + +add r0, 16*2 +add r1, 16 +add r3, 16 +jmp.loopW + +.next: +; restore pointer upBuff1 +mov r6d, r4d +and r6d, ~15 +neg r6 ; MUST BE 64-bits, it is Negtive + +; move to next row + +; move back to start point +add r3, r6 + +; adjust with stride +lea r0, [r0 + (r6 + 64) * 2]; 64 = MAX_CU_SIZE +add r1, r2 +add r1, r6 + +dec r5d +jg .loopH + +; restore unavailable pixels +pop qword [r3 + r4] + +; sum to global buffer +mov r1, r6m +mov r0, r7m + +; sum into word +; WARNING: There have a ovberflow bug on case Block64x64 with ALL pixels are SAME type (HM algorithm never pass Block64x64 into here) +pxorxm0, xm0 +psadbw xm1, xm6, xm0 +psadbw xm2, xm7, xm0 +psadbw xm3, xm8, xm0 +psadbw xm4, xm9, xm0 +psadbw x
[x265] [PATCH 1 of 5] csv: remove redundant pointer slice to avoid compiler warning
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449696277 21600 # Node ID 6e39e10b195e56c54c27050c727521c39ef29125 # Parent 2ed13ab590f0d7e1d6a0d1be445a37303ad36ed5 csv: remove redundant pointer slice to avoid compiler warning --- source/encoder/encoder.cpp |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff -r 2ed13ab590f0 -r 6e39e10b195e source/encoder/encoder.cpp --- a/source/encoder/encoder.cppMon Dec 07 11:49:08 2015 +0530 +++ b/source/encoder/encoder.cppWed Dec 09 15:24:37 2015 -0600 @@ -1272,7 +1272,6 @@ if (frameStats) { -Slice* slice = curFrame->m_encData->m_slice; const int picOrderCntLSB = (slice->m_poc - slice->m_lastIDR + (1 << BITS_FOR_POC)) % (1 << BITS_FOR_POC); frameStats->encoderOrder = m_outputCount++; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: AVX2 version of saoCuStatsE0, (133572c -> 47575c)
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449617231 21600 # Node ID 244ae08b286714f714f6476c6e71dfe4f734b4ea # Parent bc3da6a276cc043ca8034d7ff00dcafb9dcd17d4 asm: AVX2 version of saoCuStatsE0, (133572c -> 47575c) --- source/common/x86/asm-primitives.cpp |1 + source/common/x86/const-a.asm|4 +- source/common/x86/loopfilter.asm | 194 -- 3 files changed, 190 insertions(+), 9 deletions(-) diff -r bc3da6a276cc -r 244ae08b2867 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Dec 07 17:54:02 2015 -0600 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 08 17:27:11 2015 -0600 @@ -3636,6 +3636,7 @@ p.frameInitLowres = PFX(frame_init_lowres_core_avx2); p.propagateCost = PFX(mbtree_propagate_cost_avx2); +p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2); if (cpuMask & X265_CPU_BMI2) { diff -r bc3da6a276cc -r 244ae08b2867 source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Mon Dec 07 17:54:02 2015 -0600 +++ b/source/common/x86/const-a.asm Tue Dec 08 17:27:11 2015 -0600 @@ -32,10 +32,10 @@ ;; 8-bit constants -const pb_0, times 16 db 0 +const pb_0, times 32 db 0 const pb_1, times 32 db 1 const pb_2, times 32 db 2 -const pb_3, times 16 db 3 +const pb_3, times 32 db 3 const pb_4, times 32 db 4 const pb_8, times 32 db 8 const pb_15,times 32 db 15 diff -r bc3da6a276cc -r 244ae08b2867 source/common/x86/loopfilter.asm --- a/source/common/x86/loopfilter.asm Mon Dec 07 17:54:02 2015 -0600 +++ b/source/common/x86/loopfilter.asm Tue Dec 08 17:27:11 2015 -0600 @@ -32,22 +32,22 @@ pb_31: times 32 db 31 pb_124: times 32 db 124 pb_15: times 32 db 15 -pb_movemask_32: times 32 db 0x00 - times 32 db 0xFF SECTION .text cextern pb_1 +cextern pb_2 +cextern pb_3 +cextern pb_4 cextern pb_01 cextern pb_128 -cextern pb_2 +cextern pw_1 +cextern pw_n1 cextern pw_2 +cextern pw_4 cextern pw_pixel_max cextern pb_movemask -cextern pw_1 +cextern pb_movemask_32 cextern hmul_16p -cextern pb_4 -cextern pw_4 -cextern pw_n1 ; @@ -2131,6 +2131,186 @@ mov 
r6d, [rsp + 5 * 2 + 4 * 4] add [r9 + 4 * 4], r6d RET + + +;--- +; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count) +;--- +INIT_YMM avx2 +; spending rbp register to avoid x86inc stack alignment problem +cglobal saoCuStatsE0, 3,11,16 +mov r3d, r3m +mov r4d, r4m +mov r9, r5mp + +; clear internal temporary buffer +pxorxm6, xm6; count[0] +pxorxm7, xm7; count[1] +pxorxm8, xm8; count[2] +pxorxm9, xm9; count[3] +pxorxm10, xm10 ; count[4] +pxorxm11, xm11 ; stats[0] +pxorxm12, xm12 ; stats[1] +pxorxm13, xm13 ; stats[2] +pxorxm14, xm14 ; stats[3] +pxorxm15, xm15 ; stats[4] +xor r7d, r7d + +; correct stride for diff[] and rec +mov r6d, r3d +and r6d, ~15 +sub r2, r6 +lea r8, [(r6 - 64) * 2] ; 64 = MAX_CU_SIZE +lea r10, [pb_movemask_32 + 32] + +.loopH: +mov r5d, r3d + +; calculate signLeft +mov r7b, [r1] +sub r7b, [r1 - 1] +setar7b +setbr6b +sub r7b, r6b +neg r7b +pinsrb xm0, r7d, 15 + +.loopL: +movam4, [pb_128]; lower performance, but we haven't enough register for stats[] +movuxm3, [r1] +movuxm2, [r1 + 1] + +pxorxm1, xm3, xm4 +pxorxm2, xm4 +pcmpgtb xm3, xm1, xm2 +pcmpgtb xm2, xm1 +pandxm3, [pb_1] +por xm2, xm3; signRight + +palignr xm3, xm2, xm0, 15 +psignb xm3, xm4; signLeft + +movaxm0, xm2 +paddb xm2, xm3 +paddb xm2, [pb_2] ; edgeType + +; get current process mask +mov r7d, 16 +mov r6d, r5d +cmp r5d, r7d +cmovge r6d, r7d +neg
[x265] [PATCH] csv: remove redundant pointer slice to avoid compiler warning
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449509663 21600 # Node ID b7ca5ebd7fcdcd4af0ef5ae567e88c04b7694e46 # Parent 2ed13ab590f0d7e1d6a0d1be445a37303ad36ed5 csv: remove redundant pointer slice to avoid compiler warning --- source/encoder/encoder.cpp |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff -r 2ed13ab590f0 -r b7ca5ebd7fcd source/encoder/encoder.cpp --- a/source/encoder/encoder.cppMon Dec 07 11:49:08 2015 +0530 +++ b/source/encoder/encoder.cppMon Dec 07 11:34:23 2015 -0600 @@ -1272,7 +1272,6 @@ if (frameStats) { -Slice* slice = curFrame->m_encData->m_slice; const int picOrderCntLSB = (slice->m_poc - slice->m_lastIDR + (1 << BITS_FOR_POC)) % (1 << BITS_FOR_POC); frameStats->encoderOrder = m_outputCount++; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 24 of 24] sao: correct counter control logic on m_lastDeblocked & fix bug in lossless mode on latest row
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449532442 21600 # Node ID bc3da6a276cc043ca8034d7ff00dcafb9dcd17d4 # Parent a2d20844f461fcd91c1bd966f59b6cfb08358e33 sao: correct counter control logic on m_lastDeblocked & fix bug in lossless mode on latest row --- source/encoder/frameencoder.cpp | 15 ++- source/encoder/framefilter.cpp | 13 - 2 files changed, 14 insertions(+), 14 deletions(-) diff -r a2d20844f461 -r bc3da6a276cc source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 07 12:06:48 2015 -0600 +++ b/source/encoder/frameencoder.cpp Mon Dec 07 17:54:02 2015 -0600 @@ -1205,7 +1205,7 @@ if (row >= 2) { int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get(); -while(prevCol != (int)numCols - 1) +while(prevCol != (int)numCols) prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.waitForChange(prevCol); } m_frameFilter.m_parallelFilter[row - 1].waitForExit(); @@ -1219,14 +1219,14 @@ /* TODO: Early start last row */ if (m_param->bEnableLoopFilter | m_param->bEnableSAO) { -if (m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() != (int)numCols - 1) +if (m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() != (int)numCols) x265_log(m_param, X265_LOG_WARNING, "detected ParallelFilter race condition on last row\n"); // avoid race on last row and last column if (row >= 1) { int prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get(); -while(prevCol != (int)numCols - 1) +while(prevCol != (int)numCols) prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.waitForChange(prevCol); } @@ -1235,18 +1235,15 @@ m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols); m_frameFilter.m_parallelFilter[row].processTasks(-1); -/* Apply SAO on last row of CUs */ +/* Apply SAO on last row of CUs, because we always apply SAO on row[X-1] */ if (m_param->bEnableSAO) { FrameData* encData = m_frameFilter.m_parallelFilter[row].m_encData; SAOParam* saoParam = 
encData->m_saoParam; for(uint32_t col = 0; col < numCols; col++) { -if (saoParam->bSaoFlag[0]) - m_frameFilter.m_parallelFilter[row].m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], row, col); - -if (saoParam->bSaoFlag[1]) - m_frameFilter.m_parallelFilter[row].m_sao.processSaoUnitCuChroma(saoParam->ctuParam, row, col); +// NOTE: must use processSaoUnitCu(), it include TQBypass logic + m_frameFilter.m_parallelFilter[row].processSaoUnitCu(saoParam, col); } } } diff -r a2d20844f461 -r bc3da6a276cc source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 07 12:06:48 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 07 17:54:02 2015 -0600 @@ -134,8 +134,8 @@ /* restore original YUV samples to recon after SAO (if lossless) */ static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx) { -int size = cu->m_log2CUSize[absPartIdx] - 2; -uint32_t cuAddr = cu->m_cuAddr; +const int size = cu->m_log2CUSize[absPartIdx] - 2; +const uint32_t cuAddr = cu->m_cuAddr; PicYuv* reconPic = frame.m_reconPic; PicYuv* fencPic = frame.m_fencPic; @@ -151,7 +151,7 @@ pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx); pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx); -int csp = fencPic->m_picCsp; +const int csp = fencPic->m_picCsp; primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC); primitives.chroma[csp].cu[size].copy_pp(dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC); } @@ -213,6 +213,7 @@ uint32_t cuAddr = m_rowAddr + col; const CUData* ctu = m_encData->getPicCTU(cuAddr); +assert(m_frameEncoder->m_frame->m_reconPic == m_encData->m_reconPic); origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frameEncoder->m_frame); } } @@ -272,7 +273,7 @@ } } -m_lastDeblocked.set(col - 1); +m_lastDeblocked.set(col); } m_lastCol.incr(); } @@ -303,12 +304,14 @@ // Process Previous Rows SAO CU if (m_row >= 1 && numCols >= 3) m_prevRow->processSaoUnitCu(saoParam, nu
[x265] [PATCH 06 of 24] sao: merge tmpU1 and tmpU2 into tmpU, and copy these above reference pixels in every row based thread
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511563 21600 # Node ID b2c551a2927e2ee0852d1983da9226ef3c2c1871 # Parent c68eec7fb242748363ec985937b20ed1aff73f02 sao: merge tmpU1 and tmpU2 into tmpU, and copy these above reference pixels in every row based thread --- source/encoder/frameencoder.cpp |7 ++ source/encoder/framefilter.cpp | 44 +++--- source/encoder/framefilter.h|3 ++ source/encoder/sao.cpp | 39 + source/encoder/sao.h|3 +- 5 files changed, 53 insertions(+), 43 deletions(-) diff -r c68eec7fb242 -r b2c551a2927e source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 07 12:06:00 2015 -0600 +++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:06:03 2015 -0600 @@ -1124,6 +1124,13 @@ } } +/* Case of DEBLOCK Disable and SAO Enable */ +if (!m_param->bEnableLoopFilter && m_param->bEnableSAO) +{ +PicYuv* reconPic = curEncData.m_reconPic; +m_frameFilter.m_parallelFilter[row].copySaoAboveRef(reconPic, cuAddr, col); +} + if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow)) { diff -r c68eec7fb242 -r b2c551a2927e source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 07 12:06:00 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 07 12:06:03 2015 -0600 @@ -69,7 +69,7 @@ if (m_param->bEnableSsim) m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3)); -if (m_param->bEnableLoopFilter) +if (m_param->bEnableLoopFilter | m_param->bEnableSAO) m_parallelFilter = new ParallelFilter[numRows]; if (m_parallelFilter) @@ -91,6 +91,7 @@ for(int row = 0; row < numRows; row++) { +m_parallelFilter[row].m_param = m_param; m_parallelFilter[row].m_rowAddr = row * numCols; m_parallelFilter[row].m_frameEncoder = m_frameEncoder; } @@ -117,17 +118,39 @@ m_parallelFilter[row].m_encData = frame->m_encData; } -// Reset SAO global/common statistics +// Reset SAO common statistics if (m_param->bEnableSAO) 
m_parallelFilter[0].m_sao.resetStats(); } } +void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col) +{ +// Copy SAO Top Reference Pixels +int ctuWidth = g_maxCUSize; +const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride); + +// Luma +memcpy(_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel)); +X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected"); + +// Chroma +ctuWidth >>= m_sao.m_hChromaShift; + +const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC); +const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC); +memcpy(_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel)); +memcpy(_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel)); + +X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected"); +} + // NOTE: Single Threading only void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/) { const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms; const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap; +PicYuv* reconPic = m_encData->m_reconPic; const int colStart = m_lastCol.get(); // TODO: Waiting previous row finish or simple clip on it? 
const int colEnd = m_allowedCol.get(); @@ -146,6 +169,9 @@ { const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1); deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR); + +if (m_param->bEnableSAO) +copySaoAboveRef(reconPic, cuAddr - 1, col - 1); } m_lastCol.incr(); } @@ -155,6 +181,9 @@ const uint32_t cuAddr = m_rowAddr + numCols - 1; const CUData* ctuPrev = m_encData->getPicCTU(cuAddr); deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR); + +if (m_param->bEnableSAO) +copySaoAboveRef(reconPic, cuAddr, numCols - 1); } } @@ -507,23 +536,12 @@ SAOParam* saoParam = encData.m_saoParam; if (saoParam->bSaoFlag[0]) -{ m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0); -if (row != m_numRows - 1) -{ -memcpy(m_pa
[x265] [PATCH 08 of 24] remove redundant SAO context initialization
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511568 21600 # Node ID 4f7ead5981eb585a553559906911f9aa788c8ffc # Parent 7d6d5444aa9704b092bcf5ff23c5c50773e08f72 remove redundant SAO context initialization --- source/encoder/framefilter.cpp |4 1 files changed, 0 insertions(+), 4 deletions(-) diff -r 7d6d5444aa97 -r 4f7ead5981eb source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 07 12:06:05 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 07 12:06:08 2015 -0600 @@ -218,10 +218,6 @@ SAOParam* saoParam = encData.m_saoParam; if (m_param->bEnableSAO) { - m_parallelFilter[row].m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext); - m_parallelFilter[row].m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext); - m_parallelFilter[row].m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext); - m_parallelFilter[row].m_sao.rdoSaoUnitRow(saoParam, row); // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug? ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 05 of 24] move SAO into class ParallelFilter and modify it to row based
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511560 21600 # Node ID c68eec7fb242748363ec985937b20ed1aff73f02 # Parent 3542d3abd018491d6ad67a79b0e6d05b604d3818 move SAO into class ParallelFilter and modify it to row based --- source/common/common.h |1 + source/encoder/frameencoder.cpp | 36 +++--- source/encoder/framefilter.cpp | 95 +- source/encoder/framefilter.h| 14 +++--- source/encoder/sao.cpp | 81 - source/encoder/sao.h|7 ++- 6 files changed, 151 insertions(+), 83 deletions(-) diff -r 3542d3abd018 -r c68eec7fb242 source/common/common.h --- a/source/common/common.hMon Dec 07 12:05:57 2015 -0600 +++ b/source/common/common.hMon Dec 07 12:06:00 2015 -0600 @@ -215,6 +215,7 @@ #define X265_MALLOC(type, count)(type*)x265_malloc(sizeof(type) * (count)) #define X265_FREE(ptr) x265_free(ptr) +#define X265_FREE_ZERO(ptr) x265_free(ptr); (ptr) = NULL #define CHECKED_MALLOC(var, type, count) \ { \ var = (type*)x265_malloc(sizeof(type) * (count)); \ diff -r 3542d3abd018 -r c68eec7fb242 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 07 12:05:57 2015 -0600 +++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:06:00 2015 -0600 @@ -1093,7 +1093,7 @@ /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */ if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) -m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); + m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); /* Deblock with idle threading */ if (m_param->bEnableLoopFilter) @@ -1103,24 +1103,24 @@ if (row > 0) { // Waitting last threading finish -m_frameFilter.m_pdeblock[row - 1].waitForExit(); +m_frameFilter.m_parallelFilter[row - 1].waitForExit(); // Processing new group -const int allowCol = ((row >= 2) ? 
X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) : col); -m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol); -m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1); +const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(), (int)col) : col); +m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol); +m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1); } // Last Row may start early if (row == m_numRows - 1) { // Waitting last threading finish -m_frameFilter.m_pdeblock[row].waitForExit(); +m_frameFilter.m_parallelFilter[row].waitForExit(); // Processing last row -const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 1].m_lastCol.get(), (int)col) : col); -m_frameFilter.m_pdeblock[row].m_allowedCol.set(allowCol); -m_frameFilter.m_pdeblock[row].tryBondPeers(*this, 1); +const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get(), (int)col) : col); +m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol); +m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1); } } @@ -1188,17 +1188,17 @@ if (m_param->bEnableLoopFilter & (row > 0)) { /* TODO: Multiple Threading */ -m_frameFilter.m_pdeblock[row - 1].waitForExit(); +m_frameFilter.m_parallelFilter[row - 1].waitForExit(); /* Check to avoid previous row process slower than current row */ if (row >= 2) { -int prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(); +int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(); while(prevCol != (int)numCols) -prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.waitForChange(prevCol); +prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastCol.waitForChange(prevCol); } -m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols); -m_frameFilter.m_pdeblock[row - 1].processTasks(-1); +m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols); +m_frameFilter.m_parallelFilter[row - 
1].processTasks(-1); } /* trigger row-wise loop filters */ @@ -1217,12 +1217,12 @@ /* TODO: Early start last row */ if (m_param->bEnable
[x265] [PATCH 09 of 24] nits: cleanup unused code
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511570 21600 # Node ID 63809496ca0caa713f09fed495520f13006833cb # Parent 4f7ead5981eb585a553559906911f9aa788c8ffc nits: cleanup unused code --- source/encoder/sao.cpp |4 +--- 1 files changed, 1 insertions(+), 3 deletions(-) diff -r 4f7ead5981eb -r 63809496ca0c source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:08 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:10 2015 -0600 @@ -623,8 +623,6 @@ rec += stride; } -rec -= (stride << 1); - for (int idxX = 0; idxX < m_numCuInWidth; idxX++) { addr = idxY * m_numCuInWidth + idxX; @@ -658,7 +656,7 @@ } else if (idxX != (m_numCuInWidth - 1)) { -rec = plane ? reconPic->getChromaAddr(plane, addr) : reconPic->getLumaAddr(addr); +rec = reconPic->getPlaneAddr(plane, addr); for (int i = 0; i < ctuHeight + 1; i++) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 03 of 24] improve Parallel Deblock last row process
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511555 21600 # Node ID 2c6a7879eca09d28a8bcc467c0186f40b387fdd6 # Parent 6726fba8beab483428949404d6ffbd4f345e9149 improve Parallel Deblock last row process --- source/encoder/frameencoder.cpp | 13 + 1 files changed, 13 insertions(+), 0 deletions(-) diff -r 6726fba8beab -r 2c6a7879eca0 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 07 12:05:52 2015 -0600 +++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:05:55 2015 -0600 @@ -1110,6 +1110,18 @@ m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol); m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1); } + +// Last Row may start early +if (row == m_numRows - 1) +{ +// Waitting last threading finish +m_frameFilter.m_pdeblock[row].waitForExit(); + +// Processing last row +const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 1].m_lastCol.get(), (int)col) : col); +m_frameFilter.m_pdeblock[row].m_allowedCol.set(allowCol); +m_frameFilter.m_pdeblock[row].tryBondPeers(*this, 1); +} } if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && @@ -1208,6 +1220,7 @@ X265_CHECK(m_frameFilter.m_pdeblock[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed"); /* NOTE: Last Row not execute before, so didn't need wait */ +m_frameFilter.m_pdeblock[row].waitForExit(); m_frameFilter.m_pdeblock[row].m_allowedCol.set(numCols); m_frameFilter.m_pdeblock[row].processTasks(-1); } ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 02 of 24] Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511552 21600 # Node ID 6726fba8beab483428949404d6ffbd4f345e9149 # Parent 4f6b549198244291d25d6d2a0208e212960237c1 Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism --- source/common/threading.h | 18 +++ source/encoder/frameencoder.cpp | 53 +++- source/encoder/framefilter.cpp | 102 ++ source/encoder/framefilter.h| 44 - 4 files changed, 179 insertions(+), 38 deletions(-) diff -r 4f6b54919824 -r 6726fba8beab source/common/threading.h --- a/source/common/threading.h Mon Dec 07 12:05:49 2015 -0600 +++ b/source/common/threading.h Mon Dec 07 12:05:52 2015 -0600 @@ -205,6 +205,15 @@ return ret; } +int getIncr(int n = 1) +{ +EnterCriticalSection(_cs); +int ret = m_val; +m_val += n; +LeaveCriticalSection(_cs); +return ret; +} + void set(int newval) { EnterCriticalSection(_cs); @@ -394,6 +403,15 @@ return ret; } +int getIncr(int n = 1) +{ +pthread_mutex_lock(_mutex); +int ret = m_val; +m_val += n; +pthread_mutex_unlock(_mutex); +return ret; +} + void set(int newval) { pthread_mutex_lock(_mutex); diff -r 4f6b54919824 -r 6726fba8beab source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 07 12:05:49 2015 -0600 +++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:05:52 2015 -0600 @@ -124,7 +124,7 @@ m_pool = NULL; } -m_frameFilter.init(top, this, numRows); +m_frameFilter.init(top, this, numRows, numCols); // initialize HRD parameters of SPS if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) @@ -857,7 +857,7 @@ // Called by worker threads void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) { -uint32_t row = (uint32_t)intRow; +const uint32_t row = (uint32_t)intRow; CTURow& curRow = m_rows[row]; tld.analysis.m_param = m_param; @@ -899,7 +899,7 @@ { ProfileScopeEvent(encodeCTU); -uint32_t col = curRow.completed; +const uint32_t col = curRow.completed; const uint32_t cuAddr = lineStartCUAddr + col; CUData* ctu = 
curEncData.getPicCTU(cuAddr); ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp); @@ -1089,10 +1089,29 @@ } } +// TODO: move Deblock and SAO to before VBV check + /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */ if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); +/* Deblock with idle threading */ +if (m_param->bEnableLoopFilter) +{ +// TODO: Multiple Threading +// Delay ONE row to avoid Intra Prediction Conflict +if (row > 0) +{ +// Waitting last threading finish +m_frameFilter.m_pdeblock[row - 1].waitForExit(); + +// Processing new group +const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) : col); +m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol); +m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1); +} +} + if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow)) { @@ -1153,6 +1172,23 @@ if (m_param->bEnableWavefront) { +/* Processing left Deblock block with current threading */ +if (m_param->bEnableLoopFilter & (row > 0)) +{ +/* TODO: Multiple Threading */ +m_frameFilter.m_pdeblock[row - 1].waitForExit(); + +/* Check to avoid previous row process slower than current row */ +if (row >= 2) +{ +int prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(); +while(prevCol != (int)numCols) +prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.waitForChange(prevCol); +} +m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols); +m_frameFilter.m_pdeblock[row - 1].processTasks(-1); +} + /* trigger row-wise loop filters */ if (row >= m_filterRowDelay) { @@ -1163,8 +1199,19 @@ enqueueRowFilter(0); tryWakeOne(); } + if (row == m_numRows - 1) { +/* TODO: Early start last row */ +if (m_param->bEnableLoopFilter) +{
[x265] [PATCH 13 of 24] sao: avoid thread conflict on offsetEo and offsetBo
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511581 21600 # Node ID 25506e562e457a5e538cdd3c7b5ab974aa75f68d # Parent 47deea6d4e4f4aba6fdf6c210835bb843c9a4d83 sao: avoid thread conflict on offsetEo and offsetBo --- source/encoder/framefilter.cpp | 12 +--- source/encoder/sao.cpp | 38 -- source/encoder/sao.h |4 ++-- 3 files changed, 27 insertions(+), 27 deletions(-) diff -r 47deea6d4e4f -r 25506e562e45 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 07 12:06:19 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 07 12:06:21 2015 -0600 @@ -543,18 +543,16 @@ SAOParam* saoParam = encData.m_saoParam; uint32_t numCols = encData.m_slice->m_sps->numCuInWidth; -if (saoParam->bSaoFlag[0]) +for(uint32_t col = 0; col < numCols; col++) { -for(uint32_t col = 0; col < numCols; col++) +if (saoParam->bSaoFlag[0]) m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[0], row, col, 0); -} -if (saoParam->bSaoFlag[1]) -{ -for(uint32_t col = 0; col < numCols; col++) +if (saoParam->bSaoFlag[1]) +{ m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[1], row, col, 1); -for(uint32_t col = 0; col < numCols; col++) m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[2], row, col, 2); +} } if (encData.m_slice->m_pps->bTransquantBypassEnabled) diff -r 47deea6d4e4f -r 25506e562e45 source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:19 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:21 2015 -0600 @@ -325,6 +325,8 @@ tmpL = m_tmpL1[plane]; tmpU = &(m_tmpU[plane][lpelx]); +int8_t* offsetEo = m_offsetEo[plane]; + switch (typeIdx) { case SAO_EO_0: // dir: - @@ -343,7 +345,7 @@ int edgeType = signRight + signLeft + 2; signLeft = -signRight; -rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; +rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; } rec += stride; @@ -368,7 +370,7 @@ row1LastPxl = rec[stride + ctuWidth - 1]; } -primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, signLeft1, stride); 
+primitives.saoCuOrgE0(rec, offsetEo, ctuWidth, signLeft1, stride); if (!lpelx) { @@ -407,7 +409,7 @@ int edgeType = signDown + upBuff1[x] + 2; upBuff1[x] = -signDown; -rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; +rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; } rec += stride; @@ -420,11 +422,11 @@ int diff = (endY - startY) % 2; for (y = startY; y < endY - diff; y += 2) { -primitives.saoCuOrgE1_2Rows(rec, upBuff1, m_offsetEo, stride, ctuWidth); +primitives.saoCuOrgE1_2Rows(rec, upBuff1, offsetEo, stride, ctuWidth); rec += 2 * stride; } if (diff & 1) -primitives.saoCuOrgE1(rec, upBuff1, m_offsetEo, stride, ctuWidth); +primitives.saoCuOrgE1(rec, upBuff1, offsetEo, stride, ctuWidth); } break; @@ -474,7 +476,7 @@ int8_t signDown = signOf(rec[x] - rec[x + stride + 1]); int edgeType = signDown + upBuff1[x] + 2; upBufft[x + 1] = -signDown; - rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; + rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; } std::swap(upBuff1, upBufft); @@ -488,7 +490,7 @@ { int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]); -primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, m_offsetEo, endX - startX, stride); +primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride); upBufft[startX] = iSignDown2; @@ -520,14 +522,14 @@ int8_t signDown = signOf(rec[x] - tmpL[y + 1]); int edgeType = signDown + upBuff1[x] + 2; upBuff1[x - 1] = -signDown; -rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; +rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; for (x = startX + 1; x < endX; x++) { signDown = signOf(rec[x] - rec[x + stride - 1]); edgeType = signDown + upBuff1[x] + 2; up
[x265] [PATCH 15 of 24] sao: cleanup unused processSaoUnitRow()
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511587 21600 # Node ID 42a01b5f1c7cb70522e9d516c22af69c0b1e8cf0 # Parent 86558049b77fa9838a4048229cc76ee9587356b8 sao: cleanup unused processSaoUnitRow() --- source/encoder/sao.cpp | 70 1 files changed, 0 insertions(+), 70 deletions(-) diff -r 86558049b77f -r 42a01b5f1c7c source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:24 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:27 2015 -0600 @@ -603,76 +603,6 @@ } } -/* Process SAO all units */ -void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane) -{ -PicYuv* reconPic = m_frame->m_reconPic; -intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride; -uint32_t picWidth = m_param->sourceWidth; -int ctuWidth = g_maxCUSize; -int ctuHeight = g_maxCUSize; - -if (plane) -{ -picWidth >>= m_hChromaShift; -ctuWidth >>= m_hChromaShift; -ctuHeight >>= m_vChromaShift; -} - -int addr = idxY * m_numCuInWidth; -pixel* rec = reconPic->getPlaneAddr(plane, addr); - -for (int i = 0; i < ctuHeight + 1; i++) -{ -m_tmpL1[plane][i] = rec[0]; -rec += stride; -} - -for (int idxX = 0; idxX < m_numCuInWidth; idxX++) -{ -addr = idxY * m_numCuInWidth + idxX; - -bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT; -int typeIdx = ctuParam[addr].typeIdx; - -if (idxX != (m_numCuInWidth - 1)) -{ -rec = reconPic->getPlaneAddr(plane, addr); -for (int i = 0; i < ctuHeight + 1; i++) -{ -m_tmpL2[plane][i] = rec[ctuWidth - 1]; -rec += stride; -} -} - -if (typeIdx >= 0) -{ -if (!mergeLeftFlag) -{ -if (typeIdx == SAO_BO) -{ -memset(m_offsetBo[plane], 0, sizeof(m_offsetBo[0])); - -for (int i = 0; i < SAO_NUM_OFFSET; i++) -m_offsetBo[plane][((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC); -} -else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3) -{ -int offset[NUM_EDGETYPE]; -offset[0] = 0; -for (int i = 0; i < SAO_NUM_OFFSET; 
i++) -offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC; - -for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++) -m_offsetEo[plane][edgeType] = (int8_t)offset[s_eoTable[edgeType]]; -} -} -processSaoCu(addr, typeIdx, plane); -} -std::swap(m_tmpL1[plane], m_tmpL2[plane]); -} -} - /* Process SAO unit */ void SAO::processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 19 of 24] sao: share fast lookup table m_clipTable
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511598 21600 # Node ID f023dda04a265ff507746af68c213e61303805f6 # Parent 04c67fe19c5fbf025ecddbdd59f6d71f73539f58 sao: share fast lookup table m_clipTable --- source/encoder/sao.cpp | 39 +++ 1 files changed, 23 insertions(+), 16 deletions(-) diff -r 04c67fe19c5f -r f023dda04a26 source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:35 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:38 2015 -0600 @@ -115,9 +115,6 @@ const pixel rangeExt = maxY >> 1; int numCtu = m_numCuInWidth * m_numCuInHeight; -CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt); - - for (int i = 0; i < 3; i++) { CHECKED_MALLOC(m_tmpL1[i], pixel, g_maxCUSize + 1); @@ -133,25 +130,32 @@ { CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu); CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu); + +CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt); +m_clipTable = &(m_clipTableBase[rangeExt]); + +// Share with fast clip lookup table +if (initCommon) +{ +for (int i = 0; i < rangeExt; i++) +m_clipTableBase[i] = 0; + +for (int i = 0; i < maxY; i++) +m_clipTable[i] = (pixel)i; + +for (int i = maxY; i < maxY + rangeExt; i++) +m_clipTable[i] = maxY; +} } else { // must initialize these common pointer outside of function m_countPreDblk = NULL; m_offsetOrgPreDblk = NULL; +m_clipTableBase = NULL; +m_clipTable = NULL; } -m_clipTable = &(m_clipTableBase[rangeExt]); - -for (int i = 0; i < rangeExt; i++) -m_clipTableBase[i] = 0; - -for (int i = 0; i < maxY; i++) -m_clipTable[i] = (pixel)i; - -for (int i = maxY; i < maxY + rangeExt; i++) -m_clipTable[i] = maxY; - return true; fail: @@ -162,15 +166,17 @@ { X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk"); X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk"); +X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase"); +X265_CHECK(m_clipTable == NULL, "duplicate initialize on 
m_clipTable"); m_countPreDblk = root->m_countPreDblk; m_offsetOrgPreDblk = root->m_offsetOrgPreDblk; +m_clipTableBase = root->m_clipTableBase; // Unnecessary +m_clipTable = root->m_clipTable; } void SAO::destroy(int destoryCommon) { -X265_FREE_ZERO(m_clipTableBase); - for (int i = 0; i < 3; i++) { if (m_tmpL1[i]) @@ -196,6 +202,7 @@ { X265_FREE_ZERO(m_countPreDblk); X265_FREE_ZERO(m_offsetOrgPreDblk); +X265_FREE_ZERO(m_clipTableBase); } } ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 21 of 24] sao: move sao apply function into encode loop
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511603 21600 # Node ID 64cc11dff87ca95418e8812acdfe69ed3f93006f # Parent 690f1e3baab270884b3f00bd56006738ad4a5314 sao: move sao apply function into encode loop --- source/encoder/frameencoder.cpp | 15 +++ source/encoder/framefilter.cpp | 51 +++--- source/encoder/framefilter.h|7 - 3 files changed, 62 insertions(+), 11 deletions(-) diff -r 690f1e3baab2 -r 64cc11dff87c source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 07 12:06:41 2015 -0600 +++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:06:43 2015 -0600 @@ -1234,6 +1234,21 @@ m_frameFilter.m_parallelFilter[row].waitForExit(); m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols); m_frameFilter.m_parallelFilter[row].processTasks(-1); + +/* Apply SAO on last row of CUs */ +if (m_param->bEnableSAO) +{ +FrameData* encData = m_frameFilter.m_parallelFilter[row].m_encData; +SAOParam* saoParam = encData->m_saoParam; +for(uint32_t col = 0; col < numCols; col++) +{ +if (saoParam->bSaoFlag[0]) + m_frameFilter.m_parallelFilter[row].m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], row, col); + +if (saoParam->bSaoFlag[1]) + m_frameFilter.m_parallelFilter[row].m_sao.processSaoUnitCuChroma(saoParam->ctuParam, row, col); +} +} } } diff -r 690f1e3baab2 -r 64cc11dff87c source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 07 12:06:41 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 07 12:06:43 2015 -0600 @@ -36,6 +36,7 @@ static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt); uint32_t FrameFilter::ParallelFilter::numCols = 0; +uint32_t FrameFilter::ParallelFilter::numRows = 0; void FrameFilter::destroy() { @@ -92,13 +93,18 @@ for(int row = 0; row < numRows; row++) { m_parallelFilter[row].m_param = m_param; +m_parallelFilter[row].m_row = row; m_parallelFilter[row].m_rowAddr = row * numCols; 
m_parallelFilter[row].m_frameEncoder = m_frameEncoder; + +if (row > 0) +m_parallelFilter[row].m_prevRow = &m_parallelFilter[row - 1]; } } // Setting maximum columns ParallelFilter::numCols = numCols; +ParallelFilter::numRows = numRows; } void FrameFilter::start(Frame *frame, Entropy& initState, int qp) @@ -192,6 +198,16 @@ // ..S H V | m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, col - 2, cuAddr - 2); } + +// Process Previous Row SAO CU +if (m_row >= 1 && col >= 3) +{ +if (saoParam->bSaoFlag[0]) + m_prevRow->m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row - 1, col - 3); + +if (saoParam->bSaoFlag[1]) + m_prevRow->m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row - 1, col - 3); +} } m_lastDeblocked.set(col - 1); @@ -221,6 +237,31 @@ if (numCols >= 1) m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 1, cuAddr); + +// Process Previous Row SAO CU +if (saoParam->bSaoFlag[0]) +{ +if (m_row >= 1 && numCols >= 3) + m_prevRow->m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row - 1, numCols - 3); + +if (m_row >= 1 && numCols >= 2) + m_prevRow->m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row - 1, numCols - 2); + +if (m_row >= 1 && numCols >= 1) + m_prevRow->m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row - 1, numCols - 1); +} + +if (saoParam->bSaoFlag[1]) +{ +if (m_row >= 1 && numCols >= 3) + m_prevRow->m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row - 1, numCols - 3); + +if (m_row >= 1 && numCols >= 2) + m_prevRow->m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row - 1, numCols - 2); + +if (m_row >= 1 && numCols >= 1) + m_prevRow->m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row - 1, numCols - 1); +} } m_lastDeblocked.set(numCols - 1); } @@ -573,18 +61
[x265] [PATCH 22 of 24] sao: move common function into new processSaoUnitCu()
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511606 21600 # Node ID 188f52d6a9ea87876d09814126e21b4be2df5248 # Parent 64cc11dff87ca95418e8812acdfe69ed3f93006f sao: move common function into new processSaoUnitCu() --- source/encoder/framefilter.cpp | 47 +++ source/encoder/framefilter.h |5 2 files changed, 23 insertions(+), 29 deletions(-) diff -r 64cc11dff87c -r 188f52d6a9ea source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 07 12:06:43 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 07 12:06:46 2015 -0600 @@ -152,6 +152,15 @@ X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected"); } +void FrameFilter::ParallelFilter::processSaoUnitCu(SAOParam *saoParam, int col) +{ +if (saoParam->bSaoFlag[0]) +m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row, col); + +if (saoParam->bSaoFlag[1]) +m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row, col); +} + // NOTE: Single Threading only void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/) { @@ -202,11 +211,8 @@ // Process Previous Row SAO CU if (m_row >= 1 && col >= 3) { -if (saoParam->bSaoFlag[0]) - m_prevRow->m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row - 1, col - 3); - -if (saoParam->bSaoFlag[1]) - m_prevRow->m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row - 1, col - 3); +// Must delay 1 row to avoid thread data race conflict +m_prevRow->processSaoUnitCu(saoParam, col - 3); } } @@ -238,30 +244,13 @@ if (numCols >= 1) m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 1, cuAddr); -// Process Previous Row SAO CU -if (saoParam->bSaoFlag[0]) -{ -if (m_row >= 1 && numCols >= 3) - m_prevRow->m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row - 1, numCols - 3); - -if (m_row >= 1 && numCols >= 2) - m_prevRow->m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row - 1, numCols - 2); - -if (m_row >= 1 && numCols >= 1) - 
m_prevRow->m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row - 1, numCols - 1); -} - -if (saoParam->bSaoFlag[1]) -{ -if (m_row >= 1 && numCols >= 3) - m_prevRow->m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row - 1, numCols - 3); - -if (m_row >= 1 && numCols >= 2) - m_prevRow->m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row - 1, numCols - 2); - -if (m_row >= 1 && numCols >= 1) - m_prevRow->m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row - 1, numCols - 1); -} +// Process Previous Rows SAO CU +if (m_row >= 1 && numCols >= 3) +m_prevRow->processSaoUnitCu(saoParam, numCols - 3); +if (m_row >= 1 && numCols >= 2) +m_prevRow->processSaoUnitCu(saoParam, numCols - 2); +if (m_row >= 1 && numCols >= 1) +m_prevRow->processSaoUnitCu(saoParam, numCols - 1); } m_lastDeblocked.set(numCols - 1); } diff -r 64cc11dff87c -r 188f52d6a9ea source/encoder/framefilter.h --- a/source/encoder/framefilter.h Mon Dec 07 12:06:43 2015 -0600 +++ b/source/encoder/framefilter.h Mon Dec 07 12:06:46 2015 -0600 @@ -88,6 +88,11 @@ { } void processTasks(int workerThreadId); + +// Apply SAO on a CU in current row +void processSaoUnitCu(SAOParam *saoParam, int col); + +// Copy and Save SAO reference pixels for SAO Rdo decide void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col); protected: ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 12 of 24] sao: new CU level process function
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511579 21600 # Node ID 47deea6d4e4f4aba6fdf6c210835bb843c9a4d83 # Parent 1875f9ae42c05e63bcd3f1a926b93f8d9b9fd85c sao: new CU level process function --- source/encoder/framefilter.cpp | 13 +-- source/encoder/sao.cpp | 68 source/encoder/sao.h |1 + 3 files changed, 78 insertions(+), 4 deletions(-) diff -r 1875f9ae42c0 -r 47deea6d4e4f source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 07 12:06:16 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 07 12:06:19 2015 -0600 @@ -541,19 +541,24 @@ { FrameData& encData = *m_frame->m_encData; SAOParam* saoParam = encData.m_saoParam; +uint32_t numCols = encData.m_slice->m_sps->numCuInWidth; if (saoParam->bSaoFlag[0]) -m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0); +{ +for(uint32_t col = 0; col < numCols; col++) + m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[0], row, col, 0); +} if (saoParam->bSaoFlag[1]) { -m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1); -m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2); +for(uint32_t col = 0; col < numCols; col++) + m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[1], row, col, 1); +for(uint32_t col = 0; col < numCols; col++) + m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[2], row, col, 2); } if (encData.m_slice->m_pps->bTransquantBypassEnabled) { -uint32_t numCols = encData.m_slice->m_sps->numCuInWidth; uint32_t lineStartCUAddr = row * numCols; const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms; diff -r 1875f9ae42c0 -r 47deea6d4e4f source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:16 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:19 2015 -0600 @@ -671,6 +671,74 @@ } } +/* Process SAO unit */ +void SAO::processSaoUnitCu(SaoCtuParam* ctuParam, int idxY, int idxX, int plane) +{ +PicYuv* reconPic = m_frame->m_reconPic; +intptr_t stride = 
plane ? reconPic->m_strideC : reconPic->m_stride; +uint32_t picWidth = m_param->sourceWidth; +int ctuWidth = g_maxCUSize; +int ctuHeight = g_maxCUSize; + +if (plane) +{ +picWidth >>= m_hChromaShift; +ctuWidth >>= m_hChromaShift; +ctuHeight >>= m_vChromaShift; +} + +int addr = idxY * m_numCuInWidth + idxX; +pixel* rec = reconPic->getPlaneAddr(plane, addr); + +if (idxX == 0) +{ +for (int i = 0; i < ctuHeight + 1; i++) +{ +m_tmpL1[plane][i] = rec[0]; +rec += stride; +} +} + +bool mergeLeftFlag = (ctuParam[addr].mergeMode == SAO_MERGE_LEFT); +int typeIdx = ctuParam[addr].typeIdx; + +if (idxX != (m_numCuInWidth - 1)) +{ +rec = reconPic->getPlaneAddr(plane, addr); +for (int i = 0; i < ctuHeight + 1; i++) +{ +m_tmpL2[plane][i] = rec[ctuWidth - 1]; +rec += stride; +} +} + +if (typeIdx >= 0) +{ +if (!mergeLeftFlag) +{ +if (typeIdx == SAO_BO) +{ +memset(m_offsetBo, 0, sizeof(m_offsetBo)); + +for (int i = 0; i < SAO_NUM_OFFSET; i++) +m_offsetBo[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC); +} +else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3) +{ +int offset[NUM_EDGETYPE]; +offset[0] = 0; +for (int i = 0; i < SAO_NUM_OFFSET; i++) +offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC; + +for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++) +m_offsetEo[edgeType] = (int8_t)offset[s_eoTable[edgeType]]; +} +} +processSaoCu(addr, typeIdx, plane); +} +std::swap(m_tmpL1[plane], m_tmpL2[plane]); +} + void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc) { saoUnitDst->mergeMode = saoUnitSrc->mergeMode; diff -r 1875f9ae42c0 -r 47deea6d4e4f source/encoder/sao.h --- a/source/encoder/sao.h Mon Dec 07 12:06:16 2015 -0600 +++ b/source/encoder/sao.h Mon Dec 07 12:06:19 2015 -0600 @@ -132,6 +132,7 @@ // CTU-based SAO process without slice granularity void processSaoCu(int addr, int typeIdx, int plane); void processSaoUnitRow(SaoCtuParam* 
ctuParam, int idxY, int plane); +void processS
[x265] [PATCH 18 of 24] sao: convert dynamic memory alloc to class static memory
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511595 21600 # Node ID 04c67fe19c5fbf025ecddbdd59f6d71f73539f58 # Parent 294ae06be8aab74f7113a60a4abb0b63efc18ea3 sao: convert dynamic memory alloc to class static memory --- source/encoder/sao.cpp | 25 +++-- source/encoder/sao.h |6 +++--- 2 files changed, 10 insertions(+), 21 deletions(-) diff -r 294ae06be8aa -r 04c67fe19c5f source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:32 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:35 2015 -0600 @@ -73,9 +73,6 @@ SAO::SAO() { -m_count = NULL; -m_offset = NULL; -m_offsetOrg = NULL; m_countPreDblk = NULL; m_offsetOrgPreDblk = NULL; m_refDepth = 0; @@ -132,10 +129,6 @@ m_tmpU[i] += 1; } -CHECKED_MALLOC(m_count, PerClass, NUM_PLANE); -CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE); -CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE); - if (initCommon) { CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu); @@ -199,10 +192,6 @@ } } -if (m_count) X265_FREE_ZERO(m_count); -if (m_offset) X265_FREE_ZERO(m_offset); -if (m_offsetOrg) X265_FREE_ZERO(m_offsetOrg); - if (destoryCommon) { X265_FREE_ZERO(m_countPreDblk); @@ -1214,9 +1203,9 @@ /* reset offset statistics */ void SAO::resetStats() { -memset(m_count, 0, sizeof(PerClass) * NUM_PLANE); -memset(m_offset, 0, sizeof(PerClass) * NUM_PLANE); -memset(m_offsetOrg, 0, sizeof(PerClass) * NUM_PLANE); +memset(m_count, 0, sizeof(m_count)); +memset(m_offset, 0, sizeof(m_offset)); +memset(m_offsetOrg, 0, sizeof(m_offsetOrg)); } void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus) @@ -1259,13 +1248,13 @@ // TODO: Confirm the address space is continuous if (m_param->bSaoNonDeblocked) { -memcpy(m_count, m_countPreDblk[addr], 3 * sizeof(m_count[0])); -memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], 3 * sizeof(m_offsetOrg[0])); +memcpy(m_count, m_countPreDblk[addr], sizeof(m_count)); +memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg)); } else { -memset(m_count, 0, 3 * 
sizeof(m_count[0])); -memset(m_offsetOrg, 0, 3 * sizeof(m_offsetOrg[0])); +memset(m_count, 0, sizeof(m_count)); +memset(m_offsetOrg, 0, sizeof(m_offsetOrg)); } saoParam->ctuParam[0][addr].reset(); diff -r 294ae06be8aa -r 04c67fe19c5f source/encoder/sao.h --- a/source/encoder/sao.h Mon Dec 07 12:06:32 2015 -0600 +++ b/source/encoder/sao.h Mon Dec 07 12:06:35 2015 -0600 @@ -71,9 +71,9 @@ protected: /* allocated per part */ -PerClass* m_count; -PerClass* m_offset; -PerClass* m_offsetOrg; +PerPlane m_count; +PerPlane m_offset; +PerPlane m_offsetOrg; /* allocated per CTU */ PerPlane* m_countPreDblk; ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 20 of 24] sao: move SAO RDO Decide into encode loop
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511601 21600 # Node ID 690f1e3baab270884b3f00bd56006738ad4a5314 # Parent f023dda04a265ff507746af68c213e61303805f6 sao: move SAO RDO Decide into encode loop --- source/encoder/frameencoder.cpp | 77 ++--- source/encoder/framefilter.cpp | 41 ++- source/encoder/framefilter.h|1 + source/encoder/sao.cpp | 140 ++ source/encoder/sao.h|4 +- 5 files changed, 217 insertions(+), 46 deletions(-) diff -r f023dda04a26 -r 690f1e3baab2 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 07 12:06:38 2015 -0600 +++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:06:41 2015 -0600 @@ -1107,7 +1107,14 @@ m_frameFilter.m_parallelFilter[row - 1].waitForExit(); // Processing new group -const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(), (int)col) : col); +int allowCol = col; + +// avoid race condition on last column +if (row >= 2) +{ +allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() + : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col); +} m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol); m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1); } @@ -1119,7 +1126,14 @@ m_frameFilter.m_parallelFilter[row].waitForExit(); // Processing last row -const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get(), (int)col) : col); +int allowCol = col; + +// avoid race condition on last column +if (row >= 2) +{ +allowCol = X265_MIN(((col == numCols - 1) ? 
m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() + : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col); +} m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol); m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1); } @@ -1183,26 +1197,48 @@ if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1)) rowCoder.finishSlice(); +/* Processing left Deblock block with current threading */ +if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1)) +{ +/* TODO: Multiple Threading */ +/* Check to avoid previous row process slower than current row */ +if (row >= 2) +{ +int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get(); +while(prevCol != (int)numCols - 1) +prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.waitForChange(prevCol); +} +m_frameFilter.m_parallelFilter[row - 1].waitForExit(); +m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols); +m_frameFilter.m_parallelFilter[row - 1].processTasks(-1); +} + +/* trigger row-wise loop filters */ +if (row == m_numRows - 1) +{ +/* TODO: Early start last row */ +if (m_param->bEnableLoopFilter | m_param->bEnableSAO) +{ +if (m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get() != (int)numCols - 1) +x265_log(m_param, X265_LOG_WARNING, "detected ParallelFilter race condition on last row\n"); + +// avoid race on last row and last column +if (row >= 1) +{ +int prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get(); +while(prevCol != (int)numCols - 1) +prevCol = m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.waitForChange(prevCol); +} + +/* NOTE: Last Row not execute before, so didn't need wait */ +m_frameFilter.m_parallelFilter[row].waitForExit(); +m_frameFilter.m_parallelFilter[row].m_allowedCol.set(numCols); +m_frameFilter.m_parallelFilter[row].processTasks(-1); +} +} + if (m_param->bEnableWavefront) { -/* Processing left Deblock block with current threading */ -if 
((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1)) -{ -/* TODO: Multiple Threading */ -m_frameFilter.m_parallelFilter[row - 1].waitForExit(); - -/* Check to avoid previous row process slower than cu
[x265] [PATCH 14 of 24] sao: reduce address operators by split into Luma and Chroma path
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511584 21600 # Node ID 86558049b77fa9838a4048229cc76ee9587356b8 # Parent 25506e562e457a5e538cdd3c7b5ab974aa75f68d sao: reduce address operators by split into Luma and Chroma path --- source/encoder/framefilter.cpp |7 +-- source/encoder/sao.cpp | 133 ++-- source/encoder/sao.h |3 +- 3 files changed, 118 insertions(+), 25 deletions(-) diff -r 25506e562e45 -r 86558049b77f source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 07 12:06:21 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 07 12:06:24 2015 -0600 @@ -546,13 +546,10 @@ for(uint32_t col = 0; col < numCols; col++) { if (saoParam->bSaoFlag[0]) - m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[0], row, col, 0); + m_parallelFilter[row].m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], row, col); if (saoParam->bSaoFlag[1]) -{ - m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[1], row, col, 1); - m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[2], row, col, 2); -} + m_parallelFilter[row].m_sao.processSaoUnitCuChroma(saoParam->ctuParam, row, col); } if (encData.m_slice->m_pps->bTransquantBypassEnabled) diff -r 25506e562e45 -r 86558049b77f source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:21 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:24 2015 -0600 @@ -674,29 +674,21 @@ } /* Process SAO unit */ -void SAO::processSaoUnitCu(SaoCtuParam* ctuParam, int idxY, int idxX, int plane) +void SAO::processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX) { PicYuv* reconPic = m_frame->m_reconPic; -intptr_t stride = plane ? 
reconPic->m_strideC : reconPic->m_stride; -uint32_t picWidth = m_param->sourceWidth; +intptr_t stride = reconPic->m_stride; int ctuWidth = g_maxCUSize; int ctuHeight = g_maxCUSize; -if (plane) -{ -picWidth >>= m_hChromaShift; -ctuWidth >>= m_hChromaShift; -ctuHeight >>= m_vChromaShift; -} - int addr = idxY * m_numCuInWidth + idxX; -pixel* rec = reconPic->getPlaneAddr(plane, addr); +pixel* rec = reconPic->getLumaAddr(addr); if (idxX == 0) { for (int i = 0; i < ctuHeight + 1; i++) { -m_tmpL1[plane][i] = rec[0]; +m_tmpL1[0][i] = rec[0]; rec += stride; } } @@ -706,10 +698,10 @@ if (idxX != (m_numCuInWidth - 1)) { -rec = reconPic->getPlaneAddr(plane, addr); +rec = reconPic->getLumaAddr(addr); for (int i = 0; i < ctuHeight + 1; i++) { -m_tmpL2[plane][i] = rec[ctuWidth - 1]; +m_tmpL2[0][i] = rec[ctuWidth - 1]; rec += stride; } } @@ -720,10 +712,10 @@ { if (typeIdx == SAO_BO) { -memset(m_offsetBo[plane], 0, sizeof(m_offsetBo[0])); +memset(m_offsetBo[0], 0, sizeof(m_offsetBo[0])); for (int i = 0; i < SAO_NUM_OFFSET; i++) -m_offsetBo[plane][((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC); +m_offsetBo[0][((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC); } else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3) { @@ -733,12 +725,115 @@ offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC; for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++) -m_offsetEo[plane][edgeType] = (int8_t)offset[s_eoTable[edgeType]]; +m_offsetEo[0][edgeType] = (int8_t)offset[s_eoTable[edgeType]]; } } -processSaoCu(addr, typeIdx, plane); +processSaoCu(addr, typeIdx, 0); } -std::swap(m_tmpL1[plane], m_tmpL2[plane]); +std::swap(m_tmpL1[0], m_tmpL2[0]); +} + +/* Process SAO unit (Chroma only) */ +void SAO::processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX) +{ +PicYuv* reconPic = m_frame->m_reconPic; +intptr_t 
stride = reconPic->m_strideC; +int ctuWidth = g_maxCUSize; +int ctuHeight = g_maxCUSize; + +{ +ctuWidth >>= m_hChromaShift; +ctuHeight >>= m_vChromaShift; +} + +int addr = idxY * m_numCuInWidth + idxX; +pixel* recCb = reconPic->getCbAddr(addr); +pixel* recCr = reconPic->getCrAddr(addr); + +if (idxX == 0) +{ +for (int i = 0; i < ctuHeight + 1; i++) +
[x265] [PATCH 23 of 24] sao: fix lossless logic and remove unnecessary function processSao()
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511608 21600 # Node ID a2d20844f461fcd91c1bd966f59b6cfb08358e33 # Parent 188f52d6a9ea87876d09814126e21b4be2df5248 sao: fix lossless logic and remove unnecessary function processSao() --- source/encoder/framefilter.cpp | 134 +--- source/encoder/framefilter.h |1 - 2 files changed, 57 insertions(+), 78 deletions(-) diff -r 188f52d6a9ea -r a2d20844f461 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 07 12:06:46 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 07 12:06:48 2015 -0600 @@ -131,6 +131,51 @@ } } +/* restore original YUV samples to recon after SAO (if lossless) */ +static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx) +{ +int size = cu->m_log2CUSize[absPartIdx] - 2; +uint32_t cuAddr = cu->m_cuAddr; + +PicYuv* reconPic = frame.m_reconPic; +PicYuv* fencPic = frame.m_fencPic; + +pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx); +pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx); + +primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride); + +pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx); +pixel* srcCb = fencPic->getCbAddr(cuAddr, absPartIdx); + +pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx); +pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx); + +int csp = fencPic->m_picCsp; +primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC); +primitives.chroma[csp].cu[size].copy_pp(dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC); +} + +/* Original YUV restoration for CU in lossless coding */ +static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame) +{ +uint32_t absPartIdx = cuGeom.absPartIdx; +if (cu->m_cuDepth[absPartIdx] > cuGeom.depth) +{ +for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++) +{ +const CUGeom& childGeom = *( + cuGeom.childOffset + subPartIdx); +if (childGeom.flags & CUGeom::PRESENT) 
+origCUSampleRestoration(cu, childGeom, frame); +} +return; +} + +// restore original YUV samples +if (cu->m_tqBypass[absPartIdx]) +restoreOrigLosslessYuv(cu, frame, absPartIdx); +} + void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col) { // Copy SAO Top Reference Pixels @@ -154,11 +199,22 @@ void FrameFilter::ParallelFilter::processSaoUnitCu(SAOParam *saoParam, int col) { +// TODO: apply SAO on CU and copy back soon, is it necessary? if (saoParam->bSaoFlag[0]) m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row, col); if (saoParam->bSaoFlag[1]) m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row, col); + +if (m_encData->m_slice->m_pps->bTransquantBypassEnabled) +{ +const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms; +const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap; + +uint32_t cuAddr = m_rowAddr + col; +const CUData* ctu = m_encData->getPicCTU(cuAddr); +origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frameEncoder->m_frame); +} } // NOTE: Single Threading only @@ -272,14 +328,8 @@ } FrameData& encData = *m_frame->m_encData; -// SAO +// SAO: was integrate into encode loop SAOParam* saoParam = encData.m_saoParam; -if (m_param->bEnableSAO) -{ -// NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug? 
-if (row >= m_saoRowDelay) -processSao(row - m_saoRowDelay); -} // this row of CTUs has been encoded @@ -298,9 +348,6 @@ } m_parallelFilter[0].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame); - -for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++) -processSao(i); } processRowPost(row); @@ -554,70 +601,3 @@ cnt = (height - 1) * (width - 1); return ssim; } - -/* restore original YUV samples to recon after SAO (if lossless) */ -static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx) -{ -int size = cu->m_log2CUSize[absPartIdx] - 2; -uint32_t cuAddr = cu->m_cuAddr; - -PicYuv* reconPic = frame.m_reconPic; -PicYuv* fencPic = frame.m_fencPic; - -pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx); -pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx); - -primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride); - -pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx); -pixel* srcCb
[x265] [PATCH 16 of 24] sao: cleanup unnecessary memset on m_count
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511590 21600 # Node ID 6180f5987872c4b0c39f22ca3797ef82694ef781 # Parent 42a01b5f1c7cb70522e9d516c22af69c0b1e8cf0 sao: cleanup unnecessary memset on m_count --- source/encoder/sao.cpp |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff -r 42a01b5f1c7c -r 6180f5987872 source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:27 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:30 2015 -0600 @@ -1266,7 +1266,6 @@ X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane"); // TODO: Confirm the address space is continuous -memset(m_count, 0, 3 * sizeof(m_count[0])); if (m_param->bSaoNonDeblocked) { memcpy(m_count, m_countPreDblk[addr], 3 * sizeof(m_count[0])); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 07 of 24] simplify control logic on Deblock Disable and Sao Enable
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511565 21600 # Node ID 7d6d5444aa9704b092bcf5ff23c5c50773e08f72 # Parent b2c551a2927e2ee0852d1983da9226ef3c2c1871 simplify control logic on Deblock Disable and Sao Enable --- source/encoder/frameencoder.cpp | 18 ++ source/encoder/framefilter.cpp | 23 +-- 2 files changed, 23 insertions(+), 18 deletions(-) diff -r b2c551a2927e -r 7d6d5444aa97 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Dec 07 12:06:03 2015 -0600 +++ b/source/encoder/frameencoder.cpp Mon Dec 07 12:06:05 2015 -0600 @@ -104,7 +104,8 @@ m_param = top->m_param; m_numRows = numRows; m_numCols = numCols; -m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ? +m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked) +|| (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ? 2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0); m_filterRowDelayCus = m_filterRowDelay * numCols; m_rows = new CTURow[m_numRows]; @@ -1096,11 +1097,11 @@ m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); /* Deblock with idle threading */ -if (m_param->bEnableLoopFilter) +if (m_param->bEnableLoopFilter | m_param->bEnableSAO) { // TODO: Multiple Threading // Delay ONE row to avoid Intra Prediction Conflict -if (row > 0) +if (row >= 1) { // Waitting last threading finish m_frameFilter.m_parallelFilter[row - 1].waitForExit(); @@ -1124,13 +1125,6 @@ } } -/* Case of DEBLOCK Disable and SAO Enable */ -if (!m_param->bEnableLoopFilter && m_param->bEnableSAO) -{ -PicYuv* reconPic = curEncData.m_reconPic; -m_frameFilter.m_parallelFilter[row].copySaoAboveRef(reconPic, cuAddr, col); -} - if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow)) { @@ -1192,7 +1186,7 @@ if (m_param->bEnableWavefront) { /* Processing left Deblock block with current threading */ -if (m_param->bEnableLoopFilter & 
(row > 0)) +if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1)) { /* TODO: Multiple Threading */ m_frameFilter.m_parallelFilter[row - 1].waitForExit(); @@ -1222,7 +1216,7 @@ if (row == m_numRows - 1) { /* TODO: Early start last row */ -if (m_param->bEnableLoopFilter) +if (m_param->bEnableLoopFilter | m_param->bEnableSAO) { X265_CHECK(m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed"); diff -r b2c551a2927e -r 7d6d5444aa97 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppMon Dec 07 12:06:03 2015 -0600 +++ b/source/encoder/framefilter.cppMon Dec 07 12:06:05 2015 -0600 @@ -162,13 +162,20 @@ for (uint32_t col = (uint32_t)colStart; col < (uint32_t)colEnd; col++) { const uint32_t cuAddr = m_rowAddr + col; -const CUData* ctu = m_encData->getPicCTU(cuAddr); -deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER); + +if (m_param->bEnableLoopFilter) +{ +const CUData* ctu = m_encData->getPicCTU(cuAddr); +deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER); +} if (col > 0) { -const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1); -deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR); +if (m_param->bEnableLoopFilter) +{ +const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1); +deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR); +} if (m_param->bEnableSAO) copySaoAboveRef(reconPic, cuAddr - 1, col - 1); @@ -179,8 +186,12 @@ if (colEnd == (int)numCols) { const uint32_t cuAddr = m_rowAddr + numCols - 1; -const CUData* ctuPrev = m_encData->getPicCTU(cuAddr); -deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR); + +if (m_param->bEnableLoopFilter) +{ +const CUData* ctuPrev = m_encData->getPicCTU(cuAddr); +deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR); +} if (m_param->bEnableSAO) copySaoAboveRef(reconPic, cuAddr, numCols - 1); ___ x265-devel mailing list x265-devel@videolan.org 
https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 17 of 24] sao: individual statistics data every row
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511592 21600 # Node ID 294ae06be8aab74f7113a60a4abb0b63efc18ea3 # Parent 6180f5987872c4b0c39f22ca3797ef82694ef781 sao: individual statistics data every row --- source/encoder/sao.cpp | 29 ++--- 1 files changed, 10 insertions(+), 19 deletions(-) diff -r 6180f5987872 -r 294ae06be8aa source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:30 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:32 2015 -0600 @@ -132,21 +132,18 @@ m_tmpU[i] += 1; } +CHECKED_MALLOC(m_count, PerClass, NUM_PLANE); +CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE); +CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE); + if (initCommon) { -CHECKED_MALLOC(m_count, PerClass, NUM_PLANE); -CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE); -CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE); - CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu); CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu); } else { // must initialize these common pointer outside of function -m_count = NULL; -m_offset = NULL; -m_offsetOrg = NULL; m_countPreDblk = NULL; m_offsetOrgPreDblk = NULL; } @@ -170,15 +167,9 @@ void SAO::createFromRootNode(SAO* root) { -X265_CHECK(m_count == NULL, "duplicate initialize on m_count"); -X265_CHECK(m_offset == NULL, "duplicate initialize on m_offset"); -X265_CHECK(m_offsetOrg == NULL, "duplicate initialize on m_offsetOrg"); X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk"); X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk"); -m_count = root->m_count; -m_offset = root->m_offset; -m_offsetOrg = root->m_offsetOrg; m_countPreDblk = root->m_countPreDblk; m_offsetOrgPreDblk = root->m_offsetOrgPreDblk; } @@ -187,7 +178,6 @@ { X265_FREE_ZERO(m_clipTableBase); - for (int i = 0; i < 3; i++) { if (m_tmpL1[i]) @@ -209,13 +199,14 @@ } } +if (m_count) X265_FREE_ZERO(m_count); +if (m_offset) X265_FREE_ZERO(m_offset); +if (m_offsetOrg) X265_FREE_ZERO(m_offsetOrg); + if 
(destoryCommon) { -X265_FREE(m_count); -X265_FREE(m_offset); -X265_FREE(m_offsetOrg); -X265_FREE(m_countPreDblk); -X265_FREE(m_offsetOrgPreDblk); +X265_FREE_ZERO(m_countPreDblk); +X265_FREE_ZERO(m_offsetOrgPreDblk); } } ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 10 of 24] sao: change left column copy logic, move copy code outside of processSaoCu()
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511574 21600 # Node ID c85a460a38fee52477971c492602ab0fa0d19fb2 # Parent 63809496ca0caa713f09fed495520f13006833cb sao: change left column copy logic, move copy code outside of processSaoCu() --- source/encoder/sao.cpp | 37 + 1 files changed, 13 insertions(+), 24 deletions(-) diff -r 63809496ca0c -r c85a460a38fe source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:10 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:14 2015 -0600 @@ -307,17 +307,8 @@ memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */ -{ -const pixel* recR = [ctuWidth - 1]; -for (int i = 0; i < ctuHeight + 1; i++) -{ -m_tmpL2[i] = *recR; -recR += stride; -} - -tmpL = m_tmpL1; -tmpU = &(m_tmpU[plane][lpelx]); -} +tmpL = m_tmpL1; +tmpU = &(m_tmpU[plane][lpelx]); switch (typeIdx) { @@ -593,9 +584,6 @@ } default: break; } - -// if (iSaoType!=SAO_BO_0 || iSaoType!=SAO_BO_1) -std::swap(m_tmpL1, m_tmpL2); } /* Process SAO all units */ @@ -630,6 +618,16 @@ bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT; int typeIdx = ctuParam[addr].typeIdx; +if (idxX != (m_numCuInWidth - 1)) +{ +rec = reconPic->getPlaneAddr(plane, addr); +for (int i = 0; i < ctuHeight + 1; i++) +{ +m_tmpL2[i] = rec[ctuWidth - 1]; +rec += stride; +} +} + if (typeIdx >= 0) { if (!mergeLeftFlag) @@ -654,16 +652,7 @@ } processSaoCu(addr, typeIdx, plane); } -else if (idxX != (m_numCuInWidth - 1)) -{ -rec = reconPic->getPlaneAddr(plane, addr); - -for (int i = 0; i < ctuHeight + 1; i++) -{ -m_tmpL1[i] = rec[ctuWidth - 1]; -rec += stride; -} -} +std::swap(m_tmpL1, m_tmpL2); } } ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 01 of 24] Convert Deblock functions to static
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511549 21600 # Node ID 4f6b549198244291d25d6d2a0208e212960237c1 # Parent b7ca5ebd7fcdcd4af0ef5ae567e88c04b7694e46 Convert Deblock functions to statis --- source/common/deblock.cpp | 16 source/common/deblock.h | 16 2 files changed, 16 insertions(+), 16 deletions(-) diff -r b7ca5ebd7fcd -r 4f6b54919824 source/common/deblock.cpp --- a/source/common/deblock.cpp Mon Dec 07 11:34:23 2015 -0600 +++ b/source/common/deblock.cpp Mon Dec 07 12:05:49 2015 -0600 @@ -34,7 +34,7 @@ #define DEBLOCK_SMALLEST_BLOCK 8 #define DEFAULT_INTRA_TC_OFFSET 2 -void Deblock::deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir) const +void Deblock::deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir) { uint8_t blockStrength[MAX_NUM_PARTITIONS]; @@ -69,7 +69,7 @@ /* Deblocking filter process in CU-based (the same function as conventional's) * param Edge the direction of the edge in block boundary (horizonta/vertical), which is added newly */ -void Deblock::deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]) const +void Deblock::deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]) { uint32_t absPartIdx = cuGeom.absPartIdx; uint32_t depth = cuGeom.depth; @@ -124,7 +124,7 @@ return g_rasterToZscan[g_zscanToRaster[absPartIdx] + baseUnitIdx * numUnits + edgeIdx]; } -void Deblock::setEdgefilterMultiple(const CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits) const +void Deblock::setEdgefilterMultiple(const CUData* cu, uint32_t scanIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits) { X265_CHECK(numUnits > 0, "numUnits edge filter check\n"); for (uint32_t i = 0; i < numUnits; i++) @@ -134,7 +134,7 @@ } } -void Deblock::setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t 
blockStrength[]) const +void Deblock::setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]) { uint32_t log2TrSize = cu->m_log2CUSize[absPartIdx] - tuDepth; if (cu->m_tuDepth[absPartIdx] > tuDepth) @@ -149,7 +149,7 @@ setEdgefilterMultiple(cu, absPartIdx, dir, 0, 2, blockStrength, numUnits); } -void Deblock::setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits) const +void Deblock::setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits) { const uint32_t hNumUnits = numUnits >> 1; const uint32_t qNumUnits = numUnits >> 2; @@ -190,7 +190,7 @@ } } -uint8_t Deblock::getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]) const +uint8_t Deblock::getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]) { // Calculate block index uint32_t partP; @@ -340,7 +340,7 @@ } } -void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]) const +void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]) { PicYuv* reconPic = cuQ->m_encData->m_reconPic; pixel* src = reconPic->getLumaAddr(cuQ->m_cuAddr, absPartIdx); @@ -440,7 +440,7 @@ } } -void Deblock::edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]) const +void Deblock::edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]) { int32_t chFmt = cuQ->m_chromaFormat, chromaShift; intptr_t offset, srcStep; diff -r b7ca5ebd7fcd -r 4f6b54919824 source/common/deblock.h --- a/source/common/deblock.h Mon Dec 07 11:34:23 2015 -0600 +++ b/source/common/deblock.h Mon Dec 07 
12:05:49 2015 -0600 @@ -38,24 +38,24 @@ public: enum { EDGE_VER, EDGE_HOR }; -void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir) const; +static void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir); protected: // CU-level deblocking function -void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]) const; +static void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]); // set filtering functions -void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]) const; -void
[x265] [PATCH 11 of 24] sao: split SAO Left reference pixel buffer into row base
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511576 21600 # Node ID 1875f9ae42c05e63bcd3f1a926b93f8d9b9fd85c # Parent c85a460a38fee52477971c492602ab0fa0d19fb2 sao: split SAO Left reference pixel buffer into row base --- source/encoder/sao.cpp | 35 +-- source/encoder/sao.h |4 ++-- 2 files changed, 27 insertions(+), 12 deletions(-) diff -r c85a460a38fe -r 1875f9ae42c0 source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:06:14 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:06:16 2015 -0600 @@ -87,8 +87,12 @@ m_tmpU[0] = NULL; m_tmpU[1] = NULL; m_tmpU[2] = NULL; -m_tmpL1 = NULL; -m_tmpL2 = NULL; +m_tmpL1[0] = NULL; +m_tmpL1[1] = NULL; +m_tmpL1[2] = NULL; +m_tmpL2[0] = NULL; +m_tmpL2[1] = NULL; +m_tmpL2[2] = NULL; m_depthSaoRate[0][0] = 0; m_depthSaoRate[0][1] = 0; @@ -116,11 +120,12 @@ CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt); -CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1); -CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1); for (int i = 0; i < 3; i++) { +CHECKED_MALLOC(m_tmpL1[i], pixel, g_maxCUSize + 1); +CHECKED_MALLOC(m_tmpL2[i], pixel, g_maxCUSize + 1); + // SAO asm code will read 1 pixel before and after, so pad by 2 // NOTE: m_param->sourceWidth+2 enough, to avoid condition check in copySaoAboveRef(), I alloc more up to 63 bytes in here CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * g_maxCUSize + 2); @@ -182,11 +187,21 @@ { X265_FREE_ZERO(m_clipTableBase); -X265_FREE_ZERO(m_tmpL1); -X265_FREE_ZERO(m_tmpL2); for (int i = 0; i < 3; i++) { +if (m_tmpL1[i]) +{ +X265_FREE(m_tmpL1[i]); +m_tmpL1[i] = NULL; +} + +if (m_tmpL2[i]) +{ +X265_FREE(m_tmpL2[i]); +m_tmpL2[i] = NULL; +} + if (m_tmpU[i]) { X265_FREE(m_tmpU[i] - 1); @@ -307,7 +322,7 @@ memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */ -tmpL = m_tmpL1; +tmpL = m_tmpL1[plane]; tmpU = &(m_tmpU[plane][lpelx]); switch (typeIdx) @@ -607,7 +622,7 @@ for (int i = 0; i < ctuHeight + 1; i++) { -m_tmpL1[i] = rec[0]; 
+m_tmpL1[plane][i] = rec[0]; rec += stride; } @@ -623,7 +638,7 @@ rec = reconPic->getPlaneAddr(plane, addr); for (int i = 0; i < ctuHeight + 1; i++) { -m_tmpL2[i] = rec[ctuWidth - 1]; +m_tmpL2[plane][i] = rec[ctuWidth - 1]; rec += stride; } } @@ -652,7 +667,7 @@ } processSaoCu(addr, typeIdx, plane); } -std::swap(m_tmpL1, m_tmpL2); +std::swap(m_tmpL1[plane], m_tmpL2[plane]); } } diff -r c85a460a38fe -r 1875f9ae42c0 source/encoder/sao.h --- a/source/encoder/sao.h Mon Dec 07 12:06:14 2015 -0600 +++ b/source/encoder/sao.h Mon Dec 07 12:06:16 2015 -0600 @@ -93,8 +93,8 @@ pixel* m_clipTableBase; pixel* m_tmpU[3]; -pixel* m_tmpL1; -pixel* m_tmpL2; +pixel* m_tmpL1[3]; +pixel* m_tmpL2[3]; public: ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 04 of 24] optimize SAO statistics initialize
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449511557 21600 # Node ID 3542d3abd018491d6ad67a79b0e6d05b604d3818 # Parent 2c6a7879eca09d28a8bcc467c0186f40b387fdd6 optimize SAO statistics initialize --- source/encoder/sao.cpp | 50 +-- 1 files changed, 27 insertions(+), 23 deletions(-) diff -r 2c6a7879eca0 -r 3542d3abd018 source/encoder/sao.cpp --- a/source/encoder/sao.cppMon Dec 07 12:05:55 2015 -0600 +++ b/source/encoder/sao.cppMon Dec 07 12:05:57 2015 -0600 @@ -1146,30 +1146,34 @@ if (allowMerge[1]) m_entropyCoder.codeSaoMerge(0); m_entropyCoder.store(m_rdContexts.temp); + // reset stats Y, Cb, Cr -for (int plane = 0; plane < 3; plane++) +X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane"); + +// TODO: Confirm the address space is continuous +memset(m_count, 0, 3 * sizeof(m_count[0])); +if (m_param->bSaoNonDeblocked) { -for (int j = 0; j < MAX_NUM_SAO_TYPE; j++) -{ -for (int k = 0; k < MAX_NUM_SAO_CLASS; k++) -{ -m_offset[plane][j][k] = 0; -if (m_param->bSaoNonDeblocked) -{ -m_count[plane][j][k] = m_countPreDblk[addr][plane][j][k]; -m_offsetOrg[plane][j][k] = m_offsetOrgPreDblk[addr][plane][j][k]; -} -else -{ -m_count[plane][j][k] = 0; -m_offsetOrg[plane][j][k] = 0; -} -} -} +memcpy(m_count, m_countPreDblk[addr], 3 * sizeof(m_count[0])); +memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], 3 * sizeof(m_offsetOrg[0])); +} +else +{ +memset(m_count, 0, 3 * sizeof(m_count[0])); +memset(m_offsetOrg, 0, 3 * sizeof(m_offsetOrg[0])); +} -saoParam->ctuParam[plane][addr].reset(); -if (saoParam->bSaoFlag[plane > 0]) -calcSaoStatsCu(addr, plane); +saoParam->ctuParam[0][addr].reset(); +saoParam->ctuParam[1][addr].reset(); +saoParam->ctuParam[2][addr].reset(); + +if (saoParam->bSaoFlag[0]) +calcSaoStatsCu(addr, 0); + +if (saoParam->bSaoFlag[1]) +{ +calcSaoStatsCu(addr, 1); +calcSaoStatsCu(addr, 2); } saoComponentParamDist(saoParam, addr, addrUp, addrLeft, [0][0], mergeDist); 
___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 02 of 15] Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076344 21600 # Node ID 06510200bc514313f3fe0d4ee6a0b2592b2235a7 # Parent 86bd9477ccea4173c80c4f5105ce90a342b6c91c Optimize Deblock with idle threading and put Deblock into encode loop to accelerate Frame Parallelism --- source/common/threading.h | 18 +++ source/encoder/frameencoder.cpp | 53 +++- source/encoder/framefilter.cpp | 102 ++ source/encoder/framefilter.h| 44 - 4 files changed, 179 insertions(+), 38 deletions(-) diff -r 86bd9477ccea -r 06510200bc51 source/common/threading.h --- a/source/common/threading.h Wed Dec 02 11:12:22 2015 -0600 +++ b/source/common/threading.h Wed Dec 02 11:12:24 2015 -0600 @@ -205,6 +205,15 @@ return ret; } +int getIncr(int n = 1) +{ +EnterCriticalSection(_cs); +int ret = m_val; +m_val += n; +LeaveCriticalSection(_cs); +return ret; +} + void set(int newval) { EnterCriticalSection(_cs); @@ -394,6 +403,15 @@ return ret; } +int getIncr(int n = 1) +{ +pthread_mutex_lock(_mutex); +int ret = m_val; +m_val += n; +pthread_mutex_unlock(_mutex); +return ret; +} + void set(int newval) { pthread_mutex_lock(_mutex); diff -r 86bd9477ccea -r 06510200bc51 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Wed Dec 02 11:12:22 2015 -0600 +++ b/source/encoder/frameencoder.cpp Wed Dec 02 11:12:24 2015 -0600 @@ -124,7 +124,7 @@ m_pool = NULL; } -m_frameFilter.init(top, this, numRows); +m_frameFilter.init(top, this, numRows, numCols); // initialize HRD parameters of SPS if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) @@ -857,7 +857,7 @@ // Called by worker threads void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld) { -uint32_t row = (uint32_t)intRow; +const uint32_t row = (uint32_t)intRow; CTURow& curRow = m_rows[row]; tld.analysis.m_param = m_param; @@ -899,7 +899,7 @@ { ProfileScopeEvent(encodeCTU); -uint32_t col = curRow.completed; +const uint32_t col = curRow.completed; const uint32_t cuAddr = lineStartCUAddr + col; CUData* ctu = 
curEncData.getPicCTU(cuAddr); ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp); @@ -1089,10 +1089,29 @@ } } +// TODO: move Deblock and SAO to before VBV check + /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */ if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); +/* Deblock with idle threading */ +if (m_param->bEnableLoopFilter) +{ +// TODO: Multiple Threading +// Delay ONE row to avoid Intra Prediction Conflict +if (row > 0) +{ +// Waitting last threading finish +m_frameFilter.m_pdeblock[row - 1].waitForExit(); + +// Processing new group +const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) : col); +m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol); +m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1); +} +} + if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow)) { @@ -1153,6 +1172,23 @@ if (m_param->bEnableWavefront) { +/* Processing left Deblock block with current threading */ +if (m_param->bEnableLoopFilter & (row > 0)) +{ +/* TODO: Multiple Threading */ +m_frameFilter.m_pdeblock[row - 1].waitForExit(); + +/* Check to avoid previous row process slower than current row */ +if (row >= 2) +{ +int prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(); +while(prevCol != (int)numCols) +prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.waitForChange(prevCol); +} +m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols); +m_frameFilter.m_pdeblock[row - 1].processTasks(-1); +} + /* trigger row-wise loop filters */ if (row >= m_filterRowDelay) { @@ -1163,8 +1199,19 @@ enqueueRowFilter(0); tryWakeOne(); } + if (row == m_numRows - 1) { +/* TODO: Early start last row */ +if (m_param->bEnableLoopFilter) +{
[x265] [PATCH 14 of 15] sao: reduce address operators by split into Luma and Chroma path
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076380 21600 # Node ID a6d88a08af3d48cb804aa61819bd45ee685d1f59 # Parent a3a9660c91b8eeb8f70869fc4022f939c01023f0 sao: reduce address operators by split into Luma and Chroma path --- source/encoder/framefilter.cpp |7 +-- source/encoder/sao.cpp | 133 ++-- source/encoder/sao.h |3 +- 3 files changed, 118 insertions(+), 25 deletions(-) diff -r a3a9660c91b8 -r a6d88a08af3d source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppWed Dec 02 11:12:57 2015 -0600 +++ b/source/encoder/framefilter.cppWed Dec 02 11:13:00 2015 -0600 @@ -546,13 +546,10 @@ for(uint32_t col = 0; col < numCols; col++) { if (saoParam->bSaoFlag[0]) - m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[0], row, col, 0); + m_parallelFilter[row].m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], row, col); if (saoParam->bSaoFlag[1]) -{ - m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[1], row, col, 1); - m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[2], row, col, 2); -} + m_parallelFilter[row].m_sao.processSaoUnitCuChroma(saoParam->ctuParam, row, col); } if (encData.m_slice->m_pps->bTransquantBypassEnabled) diff -r a3a9660c91b8 -r a6d88a08af3d source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Dec 02 11:12:57 2015 -0600 +++ b/source/encoder/sao.cppWed Dec 02 11:13:00 2015 -0600 @@ -674,29 +674,21 @@ } /* Process SAO unit */ -void SAO::processSaoUnitCu(SaoCtuParam* ctuParam, int idxY, int idxX, int plane) +void SAO::processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX) { PicYuv* reconPic = m_frame->m_reconPic; -intptr_t stride = plane ? 
reconPic->m_strideC : reconPic->m_stride; -uint32_t picWidth = m_param->sourceWidth; +intptr_t stride = reconPic->m_stride; int ctuWidth = g_maxCUSize; int ctuHeight = g_maxCUSize; -if (plane) -{ -picWidth >>= m_hChromaShift; -ctuWidth >>= m_hChromaShift; -ctuHeight >>= m_vChromaShift; -} - int addr = idxY * m_numCuInWidth + idxX; -pixel* rec = reconPic->getPlaneAddr(plane, addr); +pixel* rec = reconPic->getLumaAddr(addr); if (idxX == 0) { for (int i = 0; i < ctuHeight + 1; i++) { -m_tmpL1[plane][i] = rec[0]; +m_tmpL1[0][i] = rec[0]; rec += stride; } } @@ -706,10 +698,10 @@ if (idxX != (m_numCuInWidth - 1)) { -rec = reconPic->getPlaneAddr(plane, addr); +rec = reconPic->getLumaAddr(addr); for (int i = 0; i < ctuHeight + 1; i++) { -m_tmpL2[plane][i] = rec[ctuWidth - 1]; +m_tmpL2[0][i] = rec[ctuWidth - 1]; rec += stride; } } @@ -720,10 +712,10 @@ { if (typeIdx == SAO_BO) { -memset(m_offsetBo[plane], 0, sizeof(m_offsetBo[0])); +memset(m_offsetBo[0], 0, sizeof(m_offsetBo[0])); for (int i = 0; i < SAO_NUM_OFFSET; i++) -m_offsetBo[plane][((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC); +m_offsetBo[0][((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC); } else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3) { @@ -733,12 +725,115 @@ offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC; for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++) -m_offsetEo[plane][edgeType] = (int8_t)offset[s_eoTable[edgeType]]; +m_offsetEo[0][edgeType] = (int8_t)offset[s_eoTable[edgeType]]; } } -processSaoCu(addr, typeIdx, plane); +processSaoCu(addr, typeIdx, 0); } -std::swap(m_tmpL1[plane], m_tmpL2[plane]); +std::swap(m_tmpL1[0], m_tmpL2[0]); +} + +/* Process SAO unit (Chroma only) */ +void SAO::processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX) +{ +PicYuv* reconPic = m_frame->m_reconPic; +intptr_t 
stride = reconPic->m_strideC; +int ctuWidth = g_maxCUSize; +int ctuHeight = g_maxCUSize; + +{ +ctuWidth >>= m_hChromaShift; +ctuHeight >>= m_vChromaShift; +} + +int addr = idxY * m_numCuInWidth + idxX; +pixel* recCb = reconPic->getCbAddr(addr); +pixel* recCr = reconPic->getCrAddr(addr); + +if (idxX == 0) +{ +for (int i = 0; i < ctuHeight + 1; i++) +
[x265] [PATCH 04 of 15] optimize SAO statistics initialize
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076349 21600 # Node ID 2601513575c7511d109ed906626b126d6e4f29fb # Parent c6e9344c5d47c029344369099ac63d5a52bdb3ed optimize SAO statistics initialize --- source/encoder/sao.cpp | 50 +-- 1 files changed, 27 insertions(+), 23 deletions(-) diff -r c6e9344c5d47 -r 2601513575c7 source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Dec 02 11:12:27 2015 -0600 +++ b/source/encoder/sao.cppWed Dec 02 11:12:29 2015 -0600 @@ -1146,30 +1146,34 @@ if (allowMerge[1]) m_entropyCoder.codeSaoMerge(0); m_entropyCoder.store(m_rdContexts.temp); + // reset stats Y, Cb, Cr -for (int plane = 0; plane < 3; plane++) +X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane"); + +// TODO: Confirm the address space is continuous +memset(m_count, 0, 3 * sizeof(m_count[0])); +if (m_param->bSaoNonDeblocked) { -for (int j = 0; j < MAX_NUM_SAO_TYPE; j++) -{ -for (int k = 0; k < MAX_NUM_SAO_CLASS; k++) -{ -m_offset[plane][j][k] = 0; -if (m_param->bSaoNonDeblocked) -{ -m_count[plane][j][k] = m_countPreDblk[addr][plane][j][k]; -m_offsetOrg[plane][j][k] = m_offsetOrgPreDblk[addr][plane][j][k]; -} -else -{ -m_count[plane][j][k] = 0; -m_offsetOrg[plane][j][k] = 0; -} -} -} +memcpy(m_count, m_countPreDblk[addr], 3 * sizeof(m_count[0])); +memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], 3 * sizeof(m_offsetOrg[0])); +} +else +{ +memset(m_count, 0, 3 * sizeof(m_count[0])); +memset(m_offsetOrg, 0, 3 * sizeof(m_offsetOrg[0])); +} -saoParam->ctuParam[plane][addr].reset(); -if (saoParam->bSaoFlag[plane > 0]) -calcSaoStatsCu(addr, plane); +saoParam->ctuParam[0][addr].reset(); +saoParam->ctuParam[1][addr].reset(); +saoParam->ctuParam[2][addr].reset(); + +if (saoParam->bSaoFlag[0]) +calcSaoStatsCu(addr, 0); + +if (saoParam->bSaoFlag[1]) +{ +calcSaoStatsCu(addr, 1); +calcSaoStatsCu(addr, 2); } saoComponentParamDist(saoParam, addr, addrUp, addrLeft, &mergeSaoParam[0][0], mergeDist);
___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 10 of 15] sao: change left column copy logic, move copy code outside of processSaoCu()
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076368 21600 # Node ID 82f6a10f44b88400f0f875025b9e8b6caff3acd3 # Parent 9f0c22a2e067f1035fedb636ed505f8539908bfd sao: change left column copy logic, move copy code outside of processSaoCu() --- source/encoder/sao.cpp | 37 + 1 files changed, 13 insertions(+), 24 deletions(-) diff -r 9f0c22a2e067 -r 82f6a10f44b8 source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Dec 02 11:12:45 2015 -0600 +++ b/source/encoder/sao.cppWed Dec 02 11:12:48 2015 -0600 @@ -307,17 +307,8 @@ memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */ -{ -const pixel* recR = &rec[ctuWidth - 1]; -for (int i = 0; i < ctuHeight + 1; i++) -{ -m_tmpL2[i] = *recR; -recR += stride; -} - -tmpL = m_tmpL1; -tmpU = &(m_tmpU[plane][lpelx]); -} +tmpL = m_tmpL1; +tmpU = &(m_tmpU[plane][lpelx]); switch (typeIdx) { @@ -593,9 +584,6 @@ } default: break; } - -// if (iSaoType!=SAO_BO_0 || iSaoType!=SAO_BO_1) -std::swap(m_tmpL1, m_tmpL2); } /* Process SAO all units */ @@ -630,6 +618,16 @@ bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT; int typeIdx = ctuParam[addr].typeIdx; +if (idxX != (m_numCuInWidth - 1)) +{ +rec = reconPic->getPlaneAddr(plane, addr); +for (int i = 0; i < ctuHeight + 1; i++) +{ +m_tmpL2[i] = rec[ctuWidth - 1]; +rec += stride; +} +} + if (typeIdx >= 0) { if (!mergeLeftFlag) @@ -654,16 +652,7 @@ } processSaoCu(addr, typeIdx, plane); } -else if (idxX != (m_numCuInWidth - 1)) -{ -rec = reconPic->getPlaneAddr(plane, addr); - -for (int i = 0; i < ctuHeight + 1; i++) -{ -m_tmpL1[i] = rec[ctuWidth - 1]; -rec += stride; -} -} +std::swap(m_tmpL1, m_tmpL2); } } ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 05 of 15] move SAO into class ParallelFilter and modify it to row based
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076352 21600 # Node ID eb20b66eebe7e9de04cec0f98f1c3c43e678fcf5 # Parent 2601513575c7511d109ed906626b126d6e4f29fb move SAO into class ParallelFilter and modify it to row based --- source/common/common.h |1 + source/encoder/frameencoder.cpp | 36 +++--- source/encoder/framefilter.cpp | 95 +- source/encoder/framefilter.h| 14 +++--- source/encoder/sao.cpp | 81 - source/encoder/sao.h|7 ++- 6 files changed, 151 insertions(+), 83 deletions(-) diff -r 2601513575c7 -r eb20b66eebe7 source/common/common.h --- a/source/common/common.hWed Dec 02 11:12:29 2015 -0600 +++ b/source/common/common.hWed Dec 02 11:12:32 2015 -0600 @@ -215,6 +215,7 @@ #define X265_MALLOC(type, count)(type*)x265_malloc(sizeof(type) * (count)) #define X265_FREE(ptr) x265_free(ptr) +#define X265_FREE_ZERO(ptr) x265_free(ptr); (ptr) = NULL #define CHECKED_MALLOC(var, type, count) \ { \ var = (type*)x265_malloc(sizeof(type) * (count)); \ diff -r 2601513575c7 -r eb20b66eebe7 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Wed Dec 02 11:12:29 2015 -0600 +++ b/source/encoder/frameencoder.cpp Wed Dec 02 11:12:32 2015 -0600 @@ -1093,7 +1093,7 @@ /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */ if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) -m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); + m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); /* Deblock with idle threading */ if (m_param->bEnableLoopFilter) @@ -1103,24 +1103,24 @@ if (row > 0) { // Waitting last threading finish -m_frameFilter.m_pdeblock[row - 1].waitForExit(); +m_frameFilter.m_parallelFilter[row - 1].waitForExit(); // Processing new group -const int allowCol = ((row >= 2) ? 
X265_MIN(m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(), (int)col) : col); -m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol); -m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1); +const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(), (int)col) : col); +m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol); +m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1); } // Last Row may start early if (row == m_numRows - 1) { // Waitting last threading finish -m_frameFilter.m_pdeblock[row].waitForExit(); +m_frameFilter.m_parallelFilter[row].waitForExit(); // Processing last row -const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 1].m_lastCol.get(), (int)col) : col); -m_frameFilter.m_pdeblock[row].m_allowedCol.set(allowCol); -m_frameFilter.m_pdeblock[row].tryBondPeers(*this, 1); +const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get(), (int)col) : col); +m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol); +m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1); } } @@ -1188,17 +1188,17 @@ if (m_param->bEnableLoopFilter & (row > 0)) { /* TODO: Multiple Threading */ -m_frameFilter.m_pdeblock[row - 1].waitForExit(); +m_frameFilter.m_parallelFilter[row - 1].waitForExit(); /* Check to avoid previous row process slower than current row */ if (row >= 2) { -int prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.get(); +int prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get(); while(prevCol != (int)numCols) -prevCol = m_frameFilter.m_pdeblock[row - 2].m_lastCol.waitForChange(prevCol); +prevCol = m_frameFilter.m_parallelFilter[row - 2].m_lastCol.waitForChange(prevCol); } -m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(numCols); -m_frameFilter.m_pdeblock[row - 1].processTasks(-1); +m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols); +m_frameFilter.m_parallelFilter[row - 
1].processTasks(-1); } /* trigger row-wise loop filters */ @@ -1217,12 +1217,12 @@ /* TODO: Early start last row */ if (m_param->bEnable
[x265] [PATCH 06 of 15] sao: merge tmpU1 and tmpU2 into tmpU, and copy these above reference pixels in every row based thread
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076356 21600 # Node ID 1c6f6e627722c767bb9484064a1cea6286c62103 # Parent eb20b66eebe7e9de04cec0f98f1c3c43e678fcf5 sao: merge tmpU1 and tmpU2 into tmpU, and copy these above reference pixels in every row based thread --- source/encoder/frameencoder.cpp |7 ++ source/encoder/framefilter.cpp | 44 +++--- source/encoder/framefilter.h|3 ++ source/encoder/sao.cpp | 39 + source/encoder/sao.h|3 +- 5 files changed, 53 insertions(+), 43 deletions(-) diff -r eb20b66eebe7 -r 1c6f6e627722 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Wed Dec 02 11:12:32 2015 -0600 +++ b/source/encoder/frameencoder.cpp Wed Dec 02 11:12:36 2015 -0600 @@ -1124,6 +1124,13 @@ } } +/* Case of DEBLOCK Disable and SAO Enable */ +if (!m_param->bEnableLoopFilter && m_param->bEnableSAO) +{ +PicYuv* reconPic = curEncData.m_reconPic; +m_frameFilter.m_parallelFilter[row].copySaoAboveRef(reconPic, cuAddr, col); +} + if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow)) { diff -r eb20b66eebe7 -r 1c6f6e627722 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppWed Dec 02 11:12:32 2015 -0600 +++ b/source/encoder/framefilter.cppWed Dec 02 11:12:36 2015 -0600 @@ -69,7 +69,7 @@ if (m_param->bEnableSsim) m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3)); -if (m_param->bEnableLoopFilter) +if (m_param->bEnableLoopFilter | m_param->bEnableSAO) m_parallelFilter = new ParallelFilter[numRows]; if (m_parallelFilter) @@ -91,6 +91,7 @@ for(int row = 0; row < numRows; row++) { +m_parallelFilter[row].m_param = m_param; m_parallelFilter[row].m_rowAddr = row * numCols; m_parallelFilter[row].m_frameEncoder = m_frameEncoder; } @@ -117,17 +118,39 @@ m_parallelFilter[row].m_encData = frame->m_encData; } -// Reset SAO global/common statistics +// Reset SAO common statistics if (m_param->bEnableSAO) 
m_parallelFilter[0].m_sao.resetStats(); } } +void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col) +{ +// Copy SAO Top Reference Pixels +int ctuWidth = g_maxCUSize; +const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride); + +// Luma +memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel)); +X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected"); + +// Chroma +ctuWidth >>= m_sao.m_hChromaShift; + +const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC); +const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC); +memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel)); +memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel)); + +X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected"); +} + // NOTE: Single Threading only void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/) { const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms; const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap; +PicYuv* reconPic = m_encData->m_reconPic; const int colStart = m_lastCol.get(); // TODO: Waiting previous row finish or simple clip on it?
const int colEnd = m_allowedCol.get(); @@ -146,6 +169,9 @@ { const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1); deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR); + +if (m_param->bEnableSAO) +copySaoAboveRef(reconPic, cuAddr - 1, col - 1); } m_lastCol.incr(); } @@ -155,6 +181,9 @@ const uint32_t cuAddr = m_rowAddr + numCols - 1; const CUData* ctuPrev = m_encData->getPicCTU(cuAddr); deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR); + +if (m_param->bEnableSAO) +copySaoAboveRef(reconPic, cuAddr, numCols - 1); } } @@ -507,23 +536,12 @@ SAOParam* saoParam = encData.m_saoParam; if (saoParam->bSaoFlag[0]) -{ m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0); -if (row != m_numRows - 1) -{ -memcpy(m_pa
[x265] [PATCH 11 of 15] sao: split SAO Left reference pixel buffer into row base
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076371 21600 # Node ID 3a423fcb4b4089de2c05a9067556f20a6fca0d1b # Parent 82f6a10f44b88400f0f875025b9e8b6caff3acd3 sao: split SAO Left reference pixel buffer into row base --- source/encoder/sao.cpp | 35 +-- source/encoder/sao.h |4 ++-- 2 files changed, 27 insertions(+), 12 deletions(-) diff -r 82f6a10f44b8 -r 3a423fcb4b40 source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Dec 02 11:12:48 2015 -0600 +++ b/source/encoder/sao.cppWed Dec 02 11:12:51 2015 -0600 @@ -87,8 +87,12 @@ m_tmpU[0] = NULL; m_tmpU[1] = NULL; m_tmpU[2] = NULL; -m_tmpL1 = NULL; -m_tmpL2 = NULL; +m_tmpL1[0] = NULL; +m_tmpL1[1] = NULL; +m_tmpL1[2] = NULL; +m_tmpL2[0] = NULL; +m_tmpL2[1] = NULL; +m_tmpL2[2] = NULL; m_depthSaoRate[0][0] = 0; m_depthSaoRate[0][1] = 0; @@ -116,11 +120,12 @@ CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt); -CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1); -CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1); for (int i = 0; i < 3; i++) { +CHECKED_MALLOC(m_tmpL1[i], pixel, g_maxCUSize + 1); +CHECKED_MALLOC(m_tmpL2[i], pixel, g_maxCUSize + 1); + // SAO asm code will read 1 pixel before and after, so pad by 2 // NOTE: m_param->sourceWidth+2 enough, to avoid condition check in copySaoAboveRef(), I alloc more up to 63 bytes in here CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * g_maxCUSize + 2); @@ -182,11 +187,21 @@ { X265_FREE_ZERO(m_clipTableBase); -X265_FREE_ZERO(m_tmpL1); -X265_FREE_ZERO(m_tmpL2); for (int i = 0; i < 3; i++) { +if (m_tmpL1[i]) +{ +X265_FREE(m_tmpL1[i]); +m_tmpL1[i] = NULL; +} + +if (m_tmpL2[i]) +{ +X265_FREE(m_tmpL2[i]); +m_tmpL2[i] = NULL; +} + if (m_tmpU[i]) { X265_FREE(m_tmpU[i] - 1); @@ -307,7 +322,7 @@ memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */ -tmpL = m_tmpL1; +tmpL = m_tmpL1[plane]; tmpU = &(m_tmpU[plane][lpelx]); switch (typeIdx) @@ -607,7 +622,7 @@ for (int i = 0; i < ctuHeight + 1; i++) { -m_tmpL1[i] = rec[0]; 
+m_tmpL1[plane][i] = rec[0]; rec += stride; } @@ -623,7 +638,7 @@ rec = reconPic->getPlaneAddr(plane, addr); for (int i = 0; i < ctuHeight + 1; i++) { -m_tmpL2[i] = rec[ctuWidth - 1]; +m_tmpL2[plane][i] = rec[ctuWidth - 1]; rec += stride; } } @@ -652,7 +667,7 @@ } processSaoCu(addr, typeIdx, plane); } -std::swap(m_tmpL1, m_tmpL2); +std::swap(m_tmpL1[plane], m_tmpL2[plane]); } } diff -r 82f6a10f44b8 -r 3a423fcb4b40 source/encoder/sao.h --- a/source/encoder/sao.h Wed Dec 02 11:12:48 2015 -0600 +++ b/source/encoder/sao.h Wed Dec 02 11:12:51 2015 -0600 @@ -93,8 +93,8 @@ pixel* m_clipTableBase; pixel* m_tmpU[3]; -pixel* m_tmpL1; -pixel* m_tmpL2; +pixel* m_tmpL1[3]; +pixel* m_tmpL2[3]; public: ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 09 of 15] nits: cleanup unused code
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076365 21600 # Node ID 9f0c22a2e067f1035fedb636ed505f8539908bfd # Parent 51648157396685c5d5ac793a8a0209cc8d4a5e4f nits: cleanup unused code --- source/encoder/sao.cpp |4 +--- 1 files changed, 1 insertions(+), 3 deletions(-) diff -r 516481573966 -r 9f0c22a2e067 source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Dec 02 11:12:41 2015 -0600 +++ b/source/encoder/sao.cppWed Dec 02 11:12:45 2015 -0600 @@ -623,8 +623,6 @@ rec += stride; } -rec -= (stride << 1); - for (int idxX = 0; idxX < m_numCuInWidth; idxX++) { addr = idxY * m_numCuInWidth + idxX; @@ -658,7 +656,7 @@ } else if (idxX != (m_numCuInWidth - 1)) { -rec = plane ? reconPic->getChromaAddr(plane, addr) : reconPic->getLumaAddr(addr); +rec = reconPic->getPlaneAddr(plane, addr); for (int i = 0; i < ctuHeight + 1; i++) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 15 of 15] sao: cleanup unused processSaoUnitRow()
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076385 21600 # Node ID 330cfe6d0528fd5016f658fcc7f14ddf986dc61f # Parent a6d88a08af3d48cb804aa61819bd45ee685d1f59 sao: cleanup unused processSaoUnitRow() --- source/encoder/sao.cpp | 70 1 files changed, 0 insertions(+), 70 deletions(-) diff -r a6d88a08af3d -r 330cfe6d0528 source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Dec 02 11:13:00 2015 -0600 +++ b/source/encoder/sao.cppWed Dec 02 11:13:05 2015 -0600 @@ -603,76 +603,6 @@ } } -/* Process SAO all units */ -void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane) -{ -PicYuv* reconPic = m_frame->m_reconPic; -intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride; -uint32_t picWidth = m_param->sourceWidth; -int ctuWidth = g_maxCUSize; -int ctuHeight = g_maxCUSize; - -if (plane) -{ -picWidth >>= m_hChromaShift; -ctuWidth >>= m_hChromaShift; -ctuHeight >>= m_vChromaShift; -} - -int addr = idxY * m_numCuInWidth; -pixel* rec = reconPic->getPlaneAddr(plane, addr); - -for (int i = 0; i < ctuHeight + 1; i++) -{ -m_tmpL1[plane][i] = rec[0]; -rec += stride; -} - -for (int idxX = 0; idxX < m_numCuInWidth; idxX++) -{ -addr = idxY * m_numCuInWidth + idxX; - -bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT; -int typeIdx = ctuParam[addr].typeIdx; - -if (idxX != (m_numCuInWidth - 1)) -{ -rec = reconPic->getPlaneAddr(plane, addr); -for (int i = 0; i < ctuHeight + 1; i++) -{ -m_tmpL2[plane][i] = rec[ctuWidth - 1]; -rec += stride; -} -} - -if (typeIdx >= 0) -{ -if (!mergeLeftFlag) -{ -if (typeIdx == SAO_BO) -{ -memset(m_offsetBo[plane], 0, sizeof(m_offsetBo[0])); - -for (int i = 0; i < SAO_NUM_OFFSET; i++) -m_offsetBo[plane][((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC); -} -else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3) -{ -int offset[NUM_EDGETYPE]; -offset[0] = 0; -for (int i = 0; i < SAO_NUM_OFFSET; 
i++) -offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC; - -for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++) -m_offsetEo[plane][edgeType] = (int8_t)offset[s_eoTable[edgeType]]; -} -} -processSaoCu(addr, typeIdx, plane); -} -std::swap(m_tmpL1[plane], m_tmpL2[plane]); -} -} - /* Process SAO unit */ void SAO::processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 12 of 15] sao: new CU level process function
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076374 21600 # Node ID b1c261378db29a1988d8e27c5eabe1a76821f83d # Parent 3a423fcb4b4089de2c05a9067556f20a6fca0d1b sao: new CU level process function --- source/encoder/framefilter.cpp | 13 +-- source/encoder/sao.cpp | 68 source/encoder/sao.h |1 + 3 files changed, 78 insertions(+), 4 deletions(-) diff -r 3a423fcb4b40 -r b1c261378db2 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppWed Dec 02 11:12:51 2015 -0600 +++ b/source/encoder/framefilter.cppWed Dec 02 11:12:54 2015 -0600 @@ -541,19 +541,24 @@ { FrameData& encData = *m_frame->m_encData; SAOParam* saoParam = encData.m_saoParam; +uint32_t numCols = encData.m_slice->m_sps->numCuInWidth; if (saoParam->bSaoFlag[0]) -m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0); +{ +for(uint32_t col = 0; col < numCols; col++) + m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[0], row, col, 0); +} if (saoParam->bSaoFlag[1]) { -m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1); -m_parallelFilter[row].m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2); +for(uint32_t col = 0; col < numCols; col++) + m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[1], row, col, 1); +for(uint32_t col = 0; col < numCols; col++) + m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[2], row, col, 2); } if (encData.m_slice->m_pps->bTransquantBypassEnabled) { -uint32_t numCols = encData.m_slice->m_sps->numCuInWidth; uint32_t lineStartCUAddr = row * numCols; const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms; diff -r 3a423fcb4b40 -r b1c261378db2 source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Dec 02 11:12:51 2015 -0600 +++ b/source/encoder/sao.cppWed Dec 02 11:12:54 2015 -0600 @@ -671,6 +671,74 @@ } } +/* Process SAO unit */ +void SAO::processSaoUnitCu(SaoCtuParam* ctuParam, int idxY, int idxX, int plane) +{ +PicYuv* reconPic = m_frame->m_reconPic; +intptr_t stride = 
plane ? reconPic->m_strideC : reconPic->m_stride; +uint32_t picWidth = m_param->sourceWidth; +int ctuWidth = g_maxCUSize; +int ctuHeight = g_maxCUSize; + +if (plane) +{ +picWidth >>= m_hChromaShift; +ctuWidth >>= m_hChromaShift; +ctuHeight >>= m_vChromaShift; +} + +int addr = idxY * m_numCuInWidth + idxX; +pixel* rec = reconPic->getPlaneAddr(plane, addr); + +if (idxX == 0) +{ +for (int i = 0; i < ctuHeight + 1; i++) +{ +m_tmpL1[plane][i] = rec[0]; +rec += stride; +} +} + +bool mergeLeftFlag = (ctuParam[addr].mergeMode == SAO_MERGE_LEFT); +int typeIdx = ctuParam[addr].typeIdx; + +if (idxX != (m_numCuInWidth - 1)) +{ +rec = reconPic->getPlaneAddr(plane, addr); +for (int i = 0; i < ctuHeight + 1; i++) +{ +m_tmpL2[plane][i] = rec[ctuWidth - 1]; +rec += stride; +} +} + +if (typeIdx >= 0) +{ +if (!mergeLeftFlag) +{ +if (typeIdx == SAO_BO) +{ +memset(m_offsetBo, 0, sizeof(m_offsetBo)); + +for (int i = 0; i < SAO_NUM_OFFSET; i++) +m_offsetBo[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC); +} +else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3) +{ +int offset[NUM_EDGETYPE]; +offset[0] = 0; +for (int i = 0; i < SAO_NUM_OFFSET; i++) +offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC; + +for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++) +m_offsetEo[edgeType] = (int8_t)offset[s_eoTable[edgeType]]; +} +} +processSaoCu(addr, typeIdx, plane); +} +std::swap(m_tmpL1[plane], m_tmpL2[plane]); +} + void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc) { saoUnitDst->mergeMode = saoUnitSrc->mergeMode; diff -r 3a423fcb4b40 -r b1c261378db2 source/encoder/sao.h --- a/source/encoder/sao.h Wed Dec 02 11:12:51 2015 -0600 +++ b/source/encoder/sao.h Wed Dec 02 11:12:54 2015 -0600 @@ -132,6 +132,7 @@ // CTU-based SAO process without slice granularity void processSaoCu(int addr, int typeIdx, int plane); void processSaoUnitRow(SaoCtuParam* 
ctuParam, int idxY, int plane); +void processS
[x265] [PATCH 08 of 15] remove reduce SAO context initialize
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076361 21600 # Node ID 51648157396685c5d5ac793a8a0209cc8d4a5e4f # Parent 015698a0de808459f496f78ac7bcb7e6eefc706f remove reduce SAO context initialize --- source/encoder/framefilter.cpp |4 1 files changed, 0 insertions(+), 4 deletions(-) diff -r 015698a0de80 -r 516481573966 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppWed Dec 02 11:12:39 2015 -0600 +++ b/source/encoder/framefilter.cppWed Dec 02 11:12:41 2015 -0600 @@ -218,10 +218,6 @@ SAOParam* saoParam = encData.m_saoParam; if (m_param->bEnableSAO) { - m_parallelFilter[row].m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext); - m_parallelFilter[row].m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext); - m_parallelFilter[row].m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext); - m_parallelFilter[row].m_sao.rdoSaoUnitRow(saoParam, row); // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug? ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 03 of 15] improve Parallel Deblock last row process
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076347 21600 # Node ID c6e9344c5d47c029344369099ac63d5a52bdb3ed # Parent 06510200bc514313f3fe0d4ee6a0b2592b2235a7 improve Parallel Deblock last row process --- source/encoder/frameencoder.cpp | 13 + 1 files changed, 13 insertions(+), 0 deletions(-) diff -r 06510200bc51 -r c6e9344c5d47 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Wed Dec 02 11:12:24 2015 -0600 +++ b/source/encoder/frameencoder.cpp Wed Dec 02 11:12:27 2015 -0600 @@ -1110,6 +1110,18 @@ m_frameFilter.m_pdeblock[row - 1].m_allowedCol.set(allowCol); m_frameFilter.m_pdeblock[row - 1].tryBondPeers(*this, 1); } + +// Last Row may start early +if (row == m_numRows - 1) +{ +// Waitting last threading finish +m_frameFilter.m_pdeblock[row].waitForExit(); + +// Processing last row +const int allowCol = ((row >= 2) ? X265_MIN(m_frameFilter.m_pdeblock[row - 1].m_lastCol.get(), (int)col) : col); +m_frameFilter.m_pdeblock[row].m_allowedCol.set(allowCol); +m_frameFilter.m_pdeblock[row].tryBondPeers(*this, 1); +} } if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && @@ -1208,6 +1220,7 @@ X265_CHECK(m_frameFilter.m_pdeblock[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed"); /* NOTE: Last Row not execute before, so didn't need wait */ +m_frameFilter.m_pdeblock[row].waitForExit(); m_frameFilter.m_pdeblock[row].m_allowedCol.set(numCols); m_frameFilter.m_pdeblock[row].processTasks(-1); } ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 13 of 15] sao: avoid thread conflict on offsetEo and offsetBo
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076377 21600 # Node ID a3a9660c91b8eeb8f70869fc4022f939c01023f0 # Parent b1c261378db29a1988d8e27c5eabe1a76821f83d sao: avoid thread conflict on offsetEo and offsetBo --- source/encoder/framefilter.cpp | 12 +--- source/encoder/sao.cpp | 38 -- source/encoder/sao.h |4 ++-- 3 files changed, 27 insertions(+), 27 deletions(-) diff -r b1c261378db2 -r a3a9660c91b8 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppWed Dec 02 11:12:54 2015 -0600 +++ b/source/encoder/framefilter.cppWed Dec 02 11:12:57 2015 -0600 @@ -543,18 +543,16 @@ SAOParam* saoParam = encData.m_saoParam; uint32_t numCols = encData.m_slice->m_sps->numCuInWidth; -if (saoParam->bSaoFlag[0]) +for(uint32_t col = 0; col < numCols; col++) { -for(uint32_t col = 0; col < numCols; col++) +if (saoParam->bSaoFlag[0]) m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[0], row, col, 0); -} -if (saoParam->bSaoFlag[1]) -{ -for(uint32_t col = 0; col < numCols; col++) +if (saoParam->bSaoFlag[1]) +{ m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[1], row, col, 1); -for(uint32_t col = 0; col < numCols; col++) m_parallelFilter[row].m_sao.processSaoUnitCu(saoParam->ctuParam[2], row, col, 2); +} } if (encData.m_slice->m_pps->bTransquantBypassEnabled) diff -r b1c261378db2 -r a3a9660c91b8 source/encoder/sao.cpp --- a/source/encoder/sao.cppWed Dec 02 11:12:54 2015 -0600 +++ b/source/encoder/sao.cppWed Dec 02 11:12:57 2015 -0600 @@ -325,6 +325,8 @@ tmpL = m_tmpL1[plane]; tmpU = &(m_tmpU[plane][lpelx]); +int8_t* offsetEo = m_offsetEo[plane]; + switch (typeIdx) { case SAO_EO_0: // dir: - @@ -343,7 +345,7 @@ int edgeType = signRight + signLeft + 2; signLeft = -signRight; -rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; +rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; } rec += stride; @@ -368,7 +370,7 @@ row1LastPxl = rec[stride + ctuWidth - 1]; } -primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, signLeft1, stride); 
+primitives.saoCuOrgE0(rec, offsetEo, ctuWidth, signLeft1, stride); if (!lpelx) { @@ -407,7 +409,7 @@ int edgeType = signDown + upBuff1[x] + 2; upBuff1[x] = -signDown; -rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; +rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; } rec += stride; @@ -420,11 +422,11 @@ int diff = (endY - startY) % 2; for (y = startY; y < endY - diff; y += 2) { -primitives.saoCuOrgE1_2Rows(rec, upBuff1, m_offsetEo, stride, ctuWidth); +primitives.saoCuOrgE1_2Rows(rec, upBuff1, offsetEo, stride, ctuWidth); rec += 2 * stride; } if (diff & 1) -primitives.saoCuOrgE1(rec, upBuff1, m_offsetEo, stride, ctuWidth); +primitives.saoCuOrgE1(rec, upBuff1, offsetEo, stride, ctuWidth); } break; @@ -474,7 +476,7 @@ int8_t signDown = signOf(rec[x] - rec[x + stride + 1]); int edgeType = signDown + upBuff1[x] + 2; upBufft[x + 1] = -signDown; - rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; + rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; } std::swap(upBuff1, upBufft); @@ -488,7 +490,7 @@ { int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]); -primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, m_offsetEo, endX - startX, stride); +primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride); upBufft[startX] = iSignDown2; @@ -520,14 +522,14 @@ int8_t signDown = signOf(rec[x] - tmpL[y + 1]); int edgeType = signDown + upBuff1[x] + 2; upBuff1[x - 1] = -signDown; -rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]]; +rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]]; for (x = startX + 1; x < endX; x++) { signDown = signOf(rec[x] - rec[x + stride - 1]); edgeType = signDown + upBuff1[x] + 2; up
[x265] [PATCH 07 of 15] simplify control logic on Deblock Disable and Sao Enable
# HG changeset patch # User Min Chen <chenm...@163.com> # Date 1449076359 21600 # Node ID 015698a0de808459f496f78ac7bcb7e6eefc706f # Parent 1c6f6e627722c767bb9484064a1cea6286c62103 simplify control logic on Deblock Disable and Sao Enable --- source/encoder/frameencoder.cpp | 18 ++ source/encoder/framefilter.cpp | 23 +-- 2 files changed, 23 insertions(+), 18 deletions(-) diff -r 1c6f6e627722 -r 015698a0de80 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Wed Dec 02 11:12:36 2015 -0600 +++ b/source/encoder/frameencoder.cpp Wed Dec 02 11:12:39 2015 -0600 @@ -104,7 +104,8 @@ m_param = top->m_param; m_numRows = numRows; m_numCols = numCols; -m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ? +m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked) +|| (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ? 2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0); m_filterRowDelayCus = m_filterRowDelay * numCols; m_rows = new CTURow[m_numRows]; @@ -1096,11 +1097,11 @@ m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); /* Deblock with idle threading */ -if (m_param->bEnableLoopFilter) +if (m_param->bEnableLoopFilter | m_param->bEnableSAO) { // TODO: Multiple Threading // Delay ONE row to avoid Intra Prediction Conflict -if (row > 0) +if (row >= 1) { // Waitting last threading finish m_frameFilter.m_parallelFilter[row - 1].waitForExit(); @@ -1124,13 +1125,6 @@ } } -/* Case of DEBLOCK Disable and SAO Enable */ -if (!m_param->bEnableLoopFilter && m_param->bEnableSAO) -{ -PicYuv* reconPic = curEncData.m_reconPic; -m_frameFilter.m_parallelFilter[row].copySaoAboveRef(reconPic, cuAddr, col); -} - if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow)) { @@ -1192,7 +1186,7 @@ if (m_param->bEnableWavefront) { /* Processing left Deblock block with current threading */ -if (m_param->bEnableLoopFilter & 
(row > 0)) +if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 1)) { /* TODO: Multiple Threading */ m_frameFilter.m_parallelFilter[row - 1].waitForExit(); @@ -1222,7 +1216,7 @@ if (row == m_numRows - 1) { /* TODO: Early start last row */ -if (m_param->bEnableLoopFilter) +if (m_param->bEnableLoopFilter | m_param->bEnableSAO) { X265_CHECK(m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.get() == (int)numCols, "Deblock m_EncodedCol check failed"); diff -r 1c6f6e627722 -r 015698a0de80 source/encoder/framefilter.cpp --- a/source/encoder/framefilter.cppWed Dec 02 11:12:36 2015 -0600 +++ b/source/encoder/framefilter.cppWed Dec 02 11:12:39 2015 -0600 @@ -162,13 +162,20 @@ for (uint32_t col = (uint32_t)colStart; col < (uint32_t)colEnd; col++) { const uint32_t cuAddr = m_rowAddr + col; -const CUData* ctu = m_encData->getPicCTU(cuAddr); -deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER); + +if (m_param->bEnableLoopFilter) +{ +const CUData* ctu = m_encData->getPicCTU(cuAddr); +deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER); +} if (col > 0) { -const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1); -deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR); +if (m_param->bEnableLoopFilter) +{ +const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1); +deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR); +} if (m_param->bEnableSAO) copySaoAboveRef(reconPic, cuAddr - 1, col - 1); @@ -179,8 +186,12 @@ if (colEnd == (int)numCols) { const uint32_t cuAddr = m_rowAddr + numCols - 1; -const CUData* ctuPrev = m_encData->getPicCTU(cuAddr); -deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR); + +if (m_param->bEnableLoopFilter) +{ +const CUData* ctuPrev = m_encData->getPicCTU(cuAddr); +deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR); +} if (m_param->bEnableSAO) copySaoAboveRef(reconPic, cuAddr, numCols - 1); ___ x265-devel mailing list x265-devel@videolan.org 
https://mailman.videolan.org/listinfo/x265-devel