Thanks for the contribution, Dan. It may take a little while to review this patch, as it adds support for a new platform.
Tom -----Original Message----- From: x265-devel [mailto:[email protected]] On Behalf Of Dan Parrot Sent: Tuesday, March 1, 2016 7:24 AM To: [email protected] Subject: [x265] [PATCH] Vectorize code for PowerPC processors using GCC Altivec API # HG changeset patch # User Dan Parrot <[email protected]> # Date 1456842340 21600 # Tue Mar 01 08:25:40 2016 -0600 # Node ID ffe6ea584ad92364e2e17a02bcb02124607b1e69 # Parent 291beccb67606494a9a144ca2cc4411ab3e21e50 Vectorize code for PowerPC processors using GCC Altivec API. CMake CMAKE_CXX_FLAGS must include -maltivec and -mabi=altivec for GCC to generate the vectorized code. diff -r 291beccb6760 -r ffe6ea584ad9 source/common/CMakeLists.txt --- a/source/common/CMakeLists.txt Fri Feb 26 16:23:56 2016 +0530 +++ b/source/common/CMakeLists.txt Tue Mar 01 08:25:40 2016 -0600 @@ -89,7 +89,7 @@ set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h) # add ARM assembly/intrinsic files here - set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S) + set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S) set(VEC_PRIMITIVES) set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") @@ -115,10 +115,20 @@ set(WINXP winxp.h winxp.cpp) endif(WIN32) +# detect flags that enable powerpc altivec interface string(REGEX MATCH +"-maltivec" ENAB_ALTIVEC_FLAG0 "${CMAKE_CXX_FLAGS}") string(REGEX MATCH +"-mabi=altivec" ENAB_ALTIVEC_FLAG1 "${CMAKE_CXX_FLAGS}") + +if(ENAB_ALTIVEC_FLAG0 AND ENAB_ALTIVEC_FLAG1) + set(SCALAR_OR_VEC_SRCS ppc_altivec/ipfilter.cpp +ppc_altivec/ppcaltivecinline.h) +else() + set(SCALAR_OR_VEC_SRCS ipfilter.cpp) +endif() + add_library(common OBJECT ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${WINXP} primitives.cpp primitives.h - pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp + pixel.cpp dct.cpp ${SCALAR_OR_VEC_SRCS} intrapred.cpp + loopfilter.cpp constants.cpp constants.h cpu.cpp cpu.h version.cpp threading.cpp threading.h diff -r 291beccb6760 -r 
ffe6ea584ad9 source/common/ppc_altivec/ipfilter.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/source/common/ppc_altivec/ipfilter.cpp Tue Mar 01 08:25:40 2016 -0600 @@ -0,0 +1,675 @@ +/********************************************************************** +******* + * Copyright (C) 2013 x265 project + * + * Authors: Deepthi Devaki <[email protected]>, + * Rajesh Paulraj <[email protected]> + * Praveen Kumar Tiwari <[email protected]> + * Min Chen <[email protected]> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ +*********************************************************************** +******/ + +#include <altivec.h> + +#include "common.h" +#include "primitives.h" +#include "x265.h" + +using namespace X265_NS; + +#if _MSC_VER +#pragma warning(disable: 4127) // conditional expression is constant, +typical for templated functions #endif + +#include "ppcaltivecinline.h" + +namespace { +// file local namespace +template<int width, int height> +void filterPixelToShort_c(const pixel* src, intptr_t srcStride, +int16_t* dst, intptr_t dstStride) { + vector unsigned short vShift = + vec_sub(vec_splats((unsigned short)IF_INTERNAL_PREC), +vec_splats((unsigned short)X265_DEPTH)); + + vector signed short vOffset = vec_splats((signed + short)IF_INTERNAL_OFFS); + + int row, col; + int srcItemCnt, dstItemCntA, dstItemCntB; + + vector signed char vMask; + vector signed char vDst = vec_splats((signed char)0); + vector signed char vRead = vec_splats((signed char)0); + + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col += 16) + { + srcItemCnt = (width - col) > 16 ? 
16 : (width - col); + dst_item_partition(srcItemCnt, &dstItemCntA, &dstItemCntB); + + read_qword(src, col, srcItemCnt, width, &vRead); + vMask = vec_splats((signed char)-1); + compute_filter_ps(vRead, &vDst, vShift, vOffset, dstItemCntA, &vMask); + store_value(dstItemCntA * 2, (signed char*)((unsigned + long)dst + 2 * col), vDst, vMask); + + vRead = vec_sld(vRead, vec_splats((signed char)0), 8); + if(dstItemCntB) + { + vMask = vec_splats((signed char)-1); + compute_filter_ps(vRead, &vDst, vShift, vOffset, dstItemCntB, &vMask); + store_value(dstItemCntB * 2, (signed char*)((unsigned long)dst + 2 * col + 16), vDst, vMask); + } + } + src += srcStride; + dst += dstStride; + } +} + +static void extendCURowColBorder(pixel* txt, intptr_t stride, int +width, int height, int marginX) { + int itemCnt; + vector signed char vSrcLeft; + vector signed char vSrcRight; + vector signed char vMask = vec_splats((signed char)-1); + vector unsigned char vPerm; + + for (int y = 0; y < height; y++) + { + for (int x = 0; x < marginX; x += 16) + { + itemCnt = (marginX - x) > 16 ? 
16 : (marginX - x); + + if (x == 0) + { + vPerm = vec_lvsl(0, (signed char*)txt); + vSrcLeft = vec_ld(0, (signed char*)txt); + vSrcLeft = vec_perm(vSrcLeft, vSrcLeft, vPerm); + + vPerm = vec_lvsl(0, (signed char*)&txt[width - 1]); + vSrcRight = vec_ld(0, (signed char*)&txt[width - 1]); + vSrcRight = vec_perm(vSrcRight, vSrcRight, vPerm); + + vSrcLeft = vec_splat(vSrcLeft, 0); + vSrcRight = vec_splat(vSrcRight, 0); + } + + //left-align the result + for(int k = itemCnt; k < 16; k++) + { + vMask = vec_sld(vMask, vec_splats((signed char)0), 1); + } + + store_value(itemCnt, (signed char*)&txt[-marginX + x], vSrcLeft, vMask); + store_value(itemCnt, (signed char*)&txt[width + x], + vSrcRight, vMask); + + } + txt += stride; + } +} + +template<int N, int width, int height> +void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* +dst, intptr_t dstStride, int coeffIdx) { + vector unsigned int vHeadRoom = vec_splats((unsigned int)IF_FILTER_PREC); + vector signed int vOffset = vec_splats((int)1); + vOffset = vec_sl(vOffset, vec_sub(vHeadRoom, vec_splats((unsigned +int)1))); + + vector unsigned short vMaxVal = vec_splats((unsigned short)1); + vMaxVal = vec_sl(vMaxVal, vec_splats((unsigned short)X265_DEPTH)); + vMaxVal = vec_sub(vMaxVal, vec_splats((unsigned short)1)); + + src -= (N / 2 - 1); + + vector signed short vCoeff; + v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients + + int row, col; + int srcItemCnt; + + vector signed char vMask; + vector signed char vDstA, vDstB; + + vector signed char vReadArr[8]; + + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col += 16) + { + srcItemCnt = (width - col) > 16 ? 
16 : (width - col); + + read_src_p(0, src, col, 1, srcItemCnt, vReadArr); + if(N == 8) + { + read_src_p(4, src, col, 1, srcItemCnt, vReadArr); + } + compute_pp(N, vReadArr, &vDstA, vOffset, vHeadRoom, vCoeff, vMaxVal, true); + compute_pp(N, vReadArr, &vDstB, vOffset, vHeadRoom, vCoeff, + vMaxVal, false); + + vDstA = vec_pack((vector signed short)vDstA, (vector signed + short)vDstB); + + vMask = vec_splats((signed char)-1); + //left-align the result + for(int k = srcItemCnt; k < 16; k++) + { + vMask = vec_sld(vMask, vec_splats((signed char)0), 1); + } + store_value(srcItemCnt, (signed char*)&dst[col], vDstA, + vMask); + + } + src += srcStride; + dst += dstStride; + } +} + +template<int N, int width, int height> +void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* +dst, intptr_t dstStride, int coeffIdx, int isRowExt) { + vector unsigned int vHeadRoom = vec_sub(vec_splats((unsigned int)IF_INTERNAL_PREC), vec_splats((unsigned int)X265_DEPTH)); + vector unsigned int vShift = vec_sub(vec_splats((unsigned int)IF_FILTER_PREC), vHeadRoom); + vector signed char vConstZero = vec_splats((signed char)0); + vector signed int vOffset = vec_splats((int)IF_INTERNAL_OFFS); + vOffset = vec_sub((vector signed int)vConstZero, vOffset); + vOffset = vec_sl(vOffset, vShift); + + int blkheight = height; + + src -= N / 2 - 1; + + if (isRowExt) + { + src -= (N / 2 - 1) * srcStride; + blkheight += N - 1; + } + + vector signed short vCoeff; + v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients + + int row, col; + int srcItemCnt, dstItemCntA, dstItemCntB; + + vector signed char vMask; + vector signed char vDst = vec_splats((signed char)0); + + vector signed char vReadArr[8]; + + for (row = 0; row < blkheight; row++) + { + for (col = 0; col < width; col += 16) + { + srcItemCnt = (width - col) > 16 ? 
16 : (width - col); + dst_item_partition(srcItemCnt, &dstItemCntA, &dstItemCntB); + + read_src_p(0, src, col, 1, srcItemCnt, vReadArr); + if(N == 8) + { + read_src_p(4, src, col, 1, srcItemCnt, vReadArr); + } + + vMask = vec_splats((signed char)-1); + compute_ps(N, dstItemCntA, vReadArr, &vDst, vOffset, vCoeff, vShift, &vMask, true); + store_value(dstItemCntA * 2, (signed char*)&dst[col], vDst, + vMask); + + if(dstItemCntB) + { + vMask = vec_splats((signed char)-1); + compute_ps(N, dstItemCntB, vReadArr, &vDst, vOffset, vCoeff, vShift, &vMask, false); + store_value(dstItemCntB * 2, (signed char*)((unsigned long)&dst[col] + 16), vDst, vMask); + } + } + src += srcStride; + dst += dstStride; + } +} + +template<int N, int width, int height> +void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, +intptr_t dstStride, int coeffIdx) { + vector unsigned int vHeadRoom = vec_splats((unsigned int)IF_FILTER_PREC); + vector signed int vOffset = vec_splats((int)1); + vOffset = vec_sl(vOffset, vec_sub(vHeadRoom, vec_splats((unsigned +int)1))); + + vector unsigned short vMaxVal = vec_splats((unsigned short)1); + vMaxVal = vec_sl(vMaxVal, vec_splats((unsigned short)X265_DEPTH)); + vMaxVal = vec_sub(vMaxVal, vec_splats((unsigned short)1)); + + src -= (N / 2 - 1) * srcStride; + + vector signed short vCoeff; + v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients + + int row, col; + int srcItemCnt; + + vector signed char vMask; + vector signed char vDstA, vDstB; + vector signed char vReadArr[8]; + + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col += 16) + { + srcItemCnt = (width - col) > 16 ? 
16 : (width - col); + + read_src_p(0, src, col, srcStride, srcItemCnt, vReadArr); + if(N == 8) + { + read_src_p(4, src, col, srcStride, srcItemCnt, vReadArr); + } + compute_pp(N, vReadArr, &vDstA, vOffset, vHeadRoom, vCoeff, vMaxVal, true); + compute_pp(N, vReadArr, &vDstB, vOffset, vHeadRoom, vCoeff, + vMaxVal, false); + + vDstA = vec_pack((vector signed short)vDstA, (vector signed + short)vDstB); + + vMask = vec_splats((signed char)-1); + //left-align the result + for(int k = srcItemCnt; k < 16; k++) + { + vMask = vec_sld(vMask, vec_splats((signed char)0), 1); + } + store_value(srcItemCnt, (signed char*)&dst[col], vDstA, vMask); + } + src += srcStride; + dst += dstStride; + } +} + +template<int N, int width, int height> +void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* +dst, intptr_t dstStride, int coeffIdx) { + vector unsigned int vHeadRoom = vec_sub(vec_splats((unsigned int)IF_INTERNAL_PREC), vec_splats((unsigned int)X265_DEPTH)); + vector unsigned int vShift = vec_sub(vec_splats((unsigned int)IF_FILTER_PREC), vHeadRoom); + vector signed char vConstZero = vec_splats((signed char)0); + vector signed int vOffset = vec_splats((int)IF_INTERNAL_OFFS); + vOffset = vec_sub((vector signed int)vConstZero, vOffset); + vOffset = vec_sl(vOffset, vShift); + + src -= (N / 2 - 1) * srcStride; + + vector signed short vCoeff; + v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients + + int row, col; + int srcItemCnt, dstItemCntA, dstItemCntB; + + vector signed char vMask; + vector signed char vDst = vec_splats((signed char)0); + + vector signed char vReadArr[8]; + + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col += 16) + { + srcItemCnt = (width - col) > 16 ? 
16 : (width - col); + dst_item_partition(srcItemCnt, &dstItemCntA, &dstItemCntB); + + read_src_p(0, src, col, srcStride, srcItemCnt, vReadArr); + if(N == 8) + { + read_src_p(4, src, col, srcStride, srcItemCnt, vReadArr); + } + + vMask = vec_splats((signed char)-1); + compute_ps(N, dstItemCntA, vReadArr, &vDst, vOffset, vCoeff, vShift, &vMask, true); + store_value(dstItemCntA * 2, (signed char*)((unsigned + long)dst + 2 * col), vDst, vMask); + + if(dstItemCntB) + { + vMask = vec_splats((signed char)-1); + compute_ps(N, dstItemCntB, vReadArr, &vDst, vOffset, vCoeff, vShift, &vMask, false); + store_value(dstItemCntB * 2, (signed char*)((unsigned long)dst + 2 * col + 16), vDst, vMask); + } + } + src += srcStride; + dst += dstStride; + } +} + +template<int N, int width, int height> +void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* +dst, intptr_t dstStride, int coeffIdx) { + vector unsigned int vHeadRoom = vec_sub(vec_splats((unsigned int)IF_INTERNAL_PREC), vec_splats((unsigned int)X265_DEPTH)); + vector unsigned int vShift = vec_add(vec_splats((unsigned int)IF_FILTER_PREC), vHeadRoom); + vector signed int vOffset = vec_splats((int)1); + vOffset = vec_sl(vOffset, vec_sub(vShift, vec_splats((unsigned int)1))); + vector signed int vTemp = vec_splats((int)IF_INTERNAL_OFFS); + vTemp = vec_sl(vTemp, vec_splats((unsigned int)IF_FILTER_PREC)); + vOffset = vec_add(vOffset, vTemp); + + vector unsigned short vMaxVal = vec_splats((unsigned short)1); + vMaxVal = vec_sl(vMaxVal, vec_splats((unsigned short)X265_DEPTH)); + vMaxVal = vec_sub(vMaxVal, vec_splats((unsigned short)1)); + + src -= (N / 2 - 1) * srcStride; + + vector signed short vCoeff; + v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients + + int row, col; + int srcItemCnt, dstItemCntA, dstItemCntB; + + vector signed char vMask; + vector signed char vDstA, vDstB = vec_splats((signed char)0); + vector signed char vReadArr[8]; + + for (row = 0; row < height; row++) + { + for (col = 0; col < width; 
col += 16) + { + srcItemCnt = (width - col) > 16 ? 16 : (width - col); + dst_item_partition(srcItemCnt, &dstItemCntA, &dstItemCntB); + + read_src_s(0, src, col, srcStride, dstItemCntA, vReadArr); + if(N == 8) + { + read_src_s(4, src, col, srcStride, dstItemCntA, vReadArr); + } + compute_vert_sp(N, vReadArr, &vDstA, vOffset, vCoeff, + vShift, vMaxVal); + + if(dstItemCntB) + { + read_src_s(0, src, col + 8, srcStride, dstItemCntB, vReadArr); + if(N == 8) + { + read_src_s(4, src, col + 8, srcStride, dstItemCntB, vReadArr); + } + compute_vert_sp(N, vReadArr, &vDstB, vOffset, vCoeff, vShift, vMaxVal); + } + + vDstA = vec_pack((vector signed short)vDstA, (vector signed + short)vDstB); + + vMask = vec_splats((signed char)-1); + //left-align the result + for(int k = srcItemCnt; k < 16; k++) + { + vMask = vec_sld(vMask, vec_splats((signed char)0), 1); + } + store_value(srcItemCnt, (signed char*)&dst[col], vDstA, vMask); + } + src += srcStride; + dst += dstStride; + } +} + +template<int N, int width, int height> +void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* +dst, intptr_t dstStride, int coeffIdx) { + vector unsigned int vShift = vec_splats((unsigned +int)IF_FILTER_PREC); + + src -= (N / 2 - 1) * srcStride; + + vector signed short vCoeff; + v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients + + int row, col; + int srcItemCnt; + + vector signed char vMask; + vector signed char vDst; + + vector signed char vReadArr[8]; + + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col += 8) + { + srcItemCnt = (width - col) > 8 ? 
8 : (width - col); + + read_src_s(0, src, col, srcStride, srcItemCnt, vReadArr); + if(N == 8) + { + read_src_s(4, src, col, srcStride, srcItemCnt, vReadArr); + } + compute_vert_ss(N, vReadArr, &vDst, vCoeff, vShift); + + vMask = vec_splats((signed char)-1); + //left-align the result + for(int k = srcItemCnt * 2; k < 16; k++) + { + vMask = vec_sld(vMask, vec_splats((signed char)0), 1); + } + store_value(srcItemCnt * 2, (signed char*)&dst[col], vDst, vMask); + } + src += srcStride; + dst += dstStride; + } +} + +template<int N> +void filterVertical_sp_c(const int16_t* src, intptr_t srcStride, pixel* +dst, intptr_t dstStride, int width, int height, int coeffIdx) { + vector unsigned int vHeadRoom = vec_sub(vec_splats((unsigned int)IF_INTERNAL_PREC), vec_splats((unsigned int)X265_DEPTH)); + vector unsigned int vShift = vec_add(vec_splats((unsigned int)IF_FILTER_PREC), vHeadRoom); + vector signed int vOffset = vec_splats((int)1); + vOffset = vec_sl(vOffset, vec_sub(vShift, vec_splats((unsigned int)1))); + vector signed int vTemp = vec_splats((int)IF_INTERNAL_OFFS); + vTemp = vec_sl(vTemp, vec_splats((unsigned int)IF_FILTER_PREC)); + vOffset = vec_add(vOffset, vTemp); + + vector unsigned short vMaxVal = vec_splats((unsigned short)1); + vMaxVal = vec_sl(vMaxVal, vec_splats((unsigned short)X265_DEPTH)); + vMaxVal = vec_sub(vMaxVal, vec_splats((unsigned short)1)); + + src -= (N / 2 - 1) * srcStride; + + vector signed short vCoeff; + v_load_coeff(N, coeffIdx, &vCoeff); // read the coefficients + + int row, col; + int srcItemCnt, dstItemCntA, dstItemCntB; + + vector signed char vMask; + vector signed char vDstA, vDstB = vec_splats((signed char)0); + vector signed char vReadArr[8]; + + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col += 16) + { + srcItemCnt = (width - col) > 16 ? 
16 : (width - col); + dst_item_partition(srcItemCnt, &dstItemCntA, &dstItemCntB); + + read_src_s(0, src, col, srcStride, dstItemCntA, vReadArr); + if(N == 8) + { + read_src_s(4, src, col, srcStride, dstItemCntA, vReadArr); + } + compute_vert_sp(N, vReadArr, &vDstA, vOffset, vCoeff, + vShift, vMaxVal); + + if(dstItemCntB) + { + read_src_s(0, src, col + 8, srcStride, dstItemCntB, vReadArr); + if(N == 8) + { + read_src_s(4, src, col + 8, srcStride, dstItemCntB, vReadArr); + } + compute_vert_sp(N, vReadArr, &vDstB, vOffset, vCoeff, vShift, vMaxVal); + } + + vDstA = vec_pack((vector signed short)vDstA, (vector signed + short)vDstB); + + vMask = vec_splats((signed char)-1); + //left-align the result + for(int k = srcItemCnt; k < 16; k++) + { + vMask = vec_sld(vMask, vec_splats((signed char)0), 1); + } + store_value(srcItemCnt, (signed char*)&dst[col], vDstA, vMask); + } + src += srcStride; + dst += dstStride; + } +} + +template<int N, int width, int height> +void interp_hv_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, +intptr_t dstStride, int idxX, int idxY) { + short immedVals[(64 + 8) * (64 + 8)]; + + interp_horiz_ps_c<N, width, height>(src, srcStride, immedVals, width, idxX, 1); + filterVertical_sp_c<N>(immedVals + 3 * width, width, dst, +dstStride, width, height, idxY); } } + +namespace X265_NS { +// x265 private namespace + +#define CHROMA_420(W, H) \ + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \ + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \ + 
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s = +filterPixelToShort_c<W, H>; + +#define CHROMA_422(W, H) \ + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \ + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \ + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s = +filterPixelToShort_c<W, H>; + +#define CHROMA_444(W, H) \ + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hps = interp_horiz_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \ + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \ + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \ + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \ + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s = +filterPixelToShort_c<W, H>; + +#define LUMA(W, H) \ + p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp_horiz_pp_c<8, W, H>; \ + p.pu[LUMA_ ## W ## x ## H].luma_hps = interp_horiz_ps_c<8, W, H>; \ + p.pu[LUMA_ ## W ## x ## H].luma_vpp = interp_vert_pp_c<8, W, H>; \ + p.pu[LUMA_ ## W ## x ## H].luma_vps = interp_vert_ps_c<8, W, H>; \ + p.pu[LUMA_ ## W ## x ## H].luma_vsp = interp_vert_sp_c<8, W, H>; \ + p.pu[LUMA_ ## W ## x ## H].luma_vss = interp_vert_ss_c<8, W, H>; \ + p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_c<8, W, H>; \ + 
p.pu[LUMA_ ## W ## x ## H].convert_p2s = filterPixelToShort_c<W, +H>; + +void setupFilterPrimitives_c(EncoderPrimitives& p) { + LUMA(4, 4); + LUMA(8, 8); + CHROMA_420(4, 4); + LUMA(4, 8); + CHROMA_420(2, 4); + LUMA(8, 4); + CHROMA_420(4, 2); + LUMA(16, 16); + CHROMA_420(8, 8); + LUMA(16, 8); + CHROMA_420(8, 4); + LUMA(8, 16); + CHROMA_420(4, 8); + LUMA(16, 12); + CHROMA_420(8, 6); + LUMA(12, 16); + CHROMA_420(6, 8); + LUMA(16, 4); + CHROMA_420(8, 2); + LUMA(4, 16); + CHROMA_420(2, 8); + LUMA(32, 32); + CHROMA_420(16, 16); + LUMA(32, 16); + CHROMA_420(16, 8); + LUMA(16, 32); + CHROMA_420(8, 16); + LUMA(32, 24); + CHROMA_420(16, 12); + LUMA(24, 32); + CHROMA_420(12, 16); + LUMA(32, 8); + CHROMA_420(16, 4); + LUMA(8, 32); + CHROMA_420(4, 16); + LUMA(64, 64); + CHROMA_420(32, 32); + LUMA(64, 32); + CHROMA_420(32, 16); + LUMA(32, 64); + CHROMA_420(16, 32); + LUMA(64, 48); + CHROMA_420(32, 24); + LUMA(48, 64); + CHROMA_420(24, 32); + LUMA(64, 16); + CHROMA_420(32, 8); + LUMA(16, 64); + CHROMA_420(8, 32); + + CHROMA_422(4, 8); + CHROMA_422(4, 4); + CHROMA_422(2, 4); + CHROMA_422(2, 8); + CHROMA_422(8, 16); + CHROMA_422(8, 8); + CHROMA_422(4, 16); + CHROMA_422(8, 12); + CHROMA_422(6, 16); + CHROMA_422(8, 4); + CHROMA_422(2, 16); + CHROMA_422(16, 32); + CHROMA_422(16, 16); + CHROMA_422(8, 32); + CHROMA_422(16, 24); + CHROMA_422(12, 32); + CHROMA_422(16, 8); + CHROMA_422(4, 32); + CHROMA_422(32, 64); + CHROMA_422(32, 32); + CHROMA_422(16, 64); + CHROMA_422(32, 48); + CHROMA_422(24, 64); + CHROMA_422(32, 16); + CHROMA_422(8, 64); + + CHROMA_444(4, 4); + CHROMA_444(8, 8); + CHROMA_444(4, 8); + CHROMA_444(8, 4); + CHROMA_444(16, 16); + CHROMA_444(16, 8); + CHROMA_444(8, 16); + CHROMA_444(16, 12); + CHROMA_444(12, 16); + CHROMA_444(16, 4); + CHROMA_444(4, 16); + CHROMA_444(32, 32); + CHROMA_444(32, 16); + CHROMA_444(16, 32); + CHROMA_444(32, 24); + CHROMA_444(24, 32); + CHROMA_444(32, 8); + CHROMA_444(8, 32); + CHROMA_444(64, 64); + CHROMA_444(64, 32); + CHROMA_444(32, 64); + 
CHROMA_444(64, 48); + CHROMA_444(48, 64); + CHROMA_444(64, 16); + CHROMA_444(16, 64); + + p.extendRowBorder = extendCURowColBorder; } } diff -r 291beccb6760 -r ffe6ea584ad9 source/common/ppc_altivec/ppcaltivecinline.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/source/common/ppc_altivec/ppcaltivecinline.h Tue Mar 01 08:25:40 2016 -0600 @@ -0,0 +1,555 @@ +/********************************************************************** +******* +* Copyright (C) 2015 x265 project +* +* Authors: Dan Parrot <[email protected]> +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +* +* This program is also available under a commercial proprietary license. +* For more information, contact us at license @ x265.com. 
+*********************************************************************** +******/ +extern inline +void v_load_coeff(int N, int coeffIdx, vector signed short* vCoeff) +__attribute__((always_inline)); + +extern inline +void dst_item_partition(int srcItemCnt, int* dstItemCntA, int* +dstItemCntB) __attribute__((always_inline)); + +void read_src_p(int startIdx, const pixel* src, int col, intptr_t +srcStride, int srcItemCnt, vector signed char vReadArr[]) +__attribute__((always_inline)); + +extern inline +void read_src_s(int startIdx, const int16_t* src, int col, intptr_t +srcStride, int srcItemCnt, vector signed char vReadArr[]) +__attribute__((always_inline)); + +extern inline +void read_qword(const pixel* src, int col, int srcItemCnt, int width, +vector signed char* vRead) __attribute__((always_inline)); + +extern inline +void compute_pp(int N, int itemCnt, vector signed char vReadArr[], +vector signed short* vSrc, vector signed char* vDst, vector signed int +vOffset, vector unsigned int vHeadRoom, vector signed short vCoeff, +vector unsigned short vMaxVal, vector signed char* vMask) +__attribute__((always_inline)); + +extern inline +void compute_vert_ps(int N, int dstItemCnt, vector signed char +vReadArr[], vector signed short* vSrc, vector signed char* vDst, vector +signed int vOffset, vector signed short vCoeff, vector signed int +vShift) __attribute__((always_inline)); + +extern inline +void compute_ps(int N, int dstItemCnt, vector signed char vReadArr[], +vector signed short* vSrc, vector signed char* vDst, vector signed int +vOffset, vector signed short vCoeff, vector unsigned int vShift, vector +signed char* vMask, bool initXfer) __attribute__((always_inline)); + +extern inline +void compute_filter_ps(vector signed char vRead, vector signed char* +vDst, vector unsigned short vShift, vector signed short vOffset, int +itemCnt, vector signed char* vMask) __attribute__((always_inline)); + +extern inline +void mult_add_s(int N, vector signed char vReadArr[], vector 
signed +short vCoeff, vector signed int vOffset, vector unsigned int vShift, +vector signed short* vVal) __attribute__((always_inline)); + +extern inline +void mult_add_p(int N, vector signed char vReadArr[], vector signed +short vCoeff, vector signed int vOffset, vector unsigned int vShift, +vector signed short* vVal, bool initXfer) +__attribute__((always_inline)); + +extern inline +void compute_vert_sp(int N, vector signed char vReadArr[], vector +signed char* vDst, vector signed int vOffset, vector signed short +vCoeff, vector unsigned int vShift, vector unsigned short vMaxVal) +__attribute__((always_inline)); + +extern inline +void compute_vert_ss(int N, vector signed char vReadArr[], vector +signed char* vDst, vector signed short vCoeff, vector unsigned int +vShift) __attribute__((always_inline)); + +extern inline +void store_value(int dstByteCnt, signed char* dstAddr, vector signed +char vDst, vector signed char vMask) __attribute__((always_inline)); + +extern inline +void v_load_coeff(int N, int coeffIdx, vector signed short* vCoeff) { + const int16_t* coeff; + vector unsigned char vPerm; + vector signed char vHi, vLo; + vector signed char vConstZero = vec_splats((signed char)0); + signed char* addrHi; + signed char* addrLo; + + if(N == 4) coeff = g_chromaFilter[coeffIdx]; + else coeff = g_lumaFilter[coeffIdx]; + + if(N == 8) addrLo = (signed char *)&coeff[7]; + else addrLo = (signed char *)&coeff[3]; + + addrHi = (signed char*)&coeff[0]; + + if(((unsigned long)addrHi & 0x0Ful) == 0) + { + *vCoeff = (vector signed short)vec_ld(0, addrHi); + } + else + { + vPerm = vec_lvsl(0, addrHi); + vHi = vec_ld(0, addrHi); + if(((unsigned long)addrHi & ~(0x0Ful)) != ((unsigned long)addrLo & ~(0x0Ful))) + { + vLo = vec_ld(0, addrLo); + *vCoeff = (vector signed short)vec_perm(vHi, vLo, vPerm); + } + else + { + *vCoeff = (vector signed short)vec_perm(vHi, vHi, vPerm); + } + } + + if(N == 4) + { + *vCoeff = vec_sld((vector signed short)vConstZero, *vCoeff, 8); // this and 
next line left-align coefficients and + *vCoeff = vec_sld(*vCoeff, (vector signed short)vConstZero, 8); // zero out the lower half of coefficient register. + } +} + +extern inline +void dst_item_partition(int srcItemCnt, int* dstItemCntA, int* +dstItemCntB) { + if(srcItemCnt <= 8) + { + *dstItemCntA = srcItemCnt; + *dstItemCntB = 0; + } + else + { + *dstItemCntA = 8; + *dstItemCntB = srcItemCnt - 8; + } +} + +extern inline +void read_src_p(int startIdx, const pixel* src, int col, intptr_t +srcStride, int srcItemCnt, vector signed char vReadArr[]) { + signed char* addrHi; + signed char *addrLo; + vector unsigned char vPerm; + vector signed char vHi, vLo; + + for(int k = startIdx; k < startIdx + 4; k++) + { + addrHi = (signed char*)&src[col + k * srcStride]; + addrLo = (signed char *)((unsigned long)addrHi + srcItemCnt - + 1); + + if(((unsigned long)addrHi & 0x0Ful) == 0) + { + vReadArr[k] = vec_ld(0, addrHi); + } + else + { + vPerm = vec_lvsl(0, addrHi); + vHi = vec_ld(0, addrHi); + if(((unsigned long)addrHi & ~(0x0Ful)) != ((unsigned long)addrLo & ~(0x0Ful))) + { + vLo = vec_ld(0, addrLo); + vReadArr[k] = vec_perm(vHi, vLo, vPerm); + } + else + { + vReadArr[k] = vec_perm(vHi, vHi, vPerm); + } + } + } +} + +extern inline +void read_src_s(int startIdx, const int16_t* src, int col, intptr_t +srcStride, int srcItemCnt, vector signed char vReadArr[]) { + signed char* addrHi; + signed char *addrLo; + vector unsigned char vPerm; + vector signed char vHi, vLo; + + for(int k = startIdx; k < startIdx + 4; k++) + { + addrHi = (signed char*)&src[col + k * srcStride]; + addrLo = (signed char *)((unsigned long)addrHi + 2 * srcItemCnt + - 1); + + if(((unsigned long)addrHi & 0x0Ful) == 0) + { + vReadArr[k] = vec_ld(0, addrHi); + } + else + { + vPerm = vec_lvsl(0, addrHi); + vHi = vec_ld(0, addrHi); + if(((unsigned long)addrHi & ~(0x0Ful)) != ((unsigned long)addrLo & ~(0x0Ful))) + { + vLo = vec_ld(0, addrLo); + vReadArr[k] = vec_perm(vHi, vLo, vPerm); + } + else + { + vReadArr[k] = 
vec_perm(vHi, vHi, vPerm); + } + } + } +} + +extern inline +void read_qword(const pixel* src, int col, int srcItemCnt, int width, +vector signed char* vRead) { + signed char* addrHi; + signed char *addrLo; + vector unsigned char vPerm; + vector signed char vHi, vLo; + + addrHi = (signed char*)&src[col]; + addrLo = (srcItemCnt < 16) ? ((signed char *)&src[width - 1]) : + ((signed char *)&src[col + 15]); + + if(((unsigned long)addrHi & 0x0Ful) == 0) + { + *vRead = vec_ld(0, addrHi); + } + else + { + vPerm = vec_lvsl(0, addrHi); + vHi = vec_ld(0, addrHi); + if(((unsigned long)addrHi & ~(0x0Ful)) != ((unsigned long)addrLo & ~(0x0Ful))) + { + vLo = vec_ld(0, addrLo); + *vRead = vec_perm(vHi, vLo, vPerm); + } + else + { + *vRead = vec_perm(vHi, vHi, vPerm); + } + } +} + +extern inline +void compute_pp(int N, vector signed char vReadArr[], vector signed +char* vDst, vector signed int vOffset, vector unsigned int vHeadRoom, +vector signed short vCoeff, vector unsigned short vMaxVal, bool initXfer) { + vector signed short vVal; + vector bool short compare; + + mult_add_p(N, vReadArr, vCoeff, vOffset, vHeadRoom, &vVal, + initXfer); + + compare = vec_cmplt(vVal, vec_splats((signed short)0)); + vVal = vec_sel(vVal, vec_splats((signed short)0), compare); + + compare = vec_cmpgt(vVal, (vector signed short)vMaxVal); + vVal = vec_sel(vVal, (vector signed short)vMaxVal, compare); + + *vDst = (vector signed char)vVal; +} + +void compute_ps(int N, int dstItemCnt, vector signed char vReadArr[], +vector signed char* vDst, vector signed int vOffset, vector signed +short vCoeff, vector unsigned int vShift, vector signed char* vMask, bool initXfer) { + vector signed short vVal; + + mult_add_p(N, vReadArr, vCoeff, vOffset, vShift, &vVal, initXfer); + + *vDst = (vector signed char)vVal; + + // mask to left-align the result when less than 16-bytes + for (int k = dstItemCnt * 2; k < 16; k++) + { + *vMask = vec_sld(*vMask, vec_splats((signed char)0), 1); + } + +} + +extern inline +void 
compute_filter_ps(vector signed char vRead, vector signed char* +vDst, vector unsigned short vShift, vector signed short vOffset, int +itemCnt, vector signed char* vMask) { + vector signed char vConstZero = vec_splats((signed char)0); + vector signed short vVal = vec_splats((signed short)0); + + vVal = (vector signed short)vec_mergeh(vConstZero, vRead); + vVal = vec_sl(vVal, vShift); + vVal = vec_sub(vVal, vOffset); + + *vDst = (vector signed char)vVal; + + // mask needed when result is less than 16-bytes + for (int k = itemCnt * 2; k < 16; k++) + { + *vMask = vec_sld(*vMask, (vector signed char)vConstZero, 1); + } +} + +extern inline +void mult_add_s(int N, vector signed char vReadArr[], vector signed +short vCoeff, vector signed int vOffset, vector unsigned int vShift, +vector signed short* vVal) { + vector signed int vRsltEven[8]; + vector signed int vRsltOdd[8]; + + if(N == 4) + { + vRsltEven[0] = vec_mule(vec_splat(vCoeff, 0), (vector signed short)vReadArr[0]); + vRsltEven[1] = vec_mule(vec_splat(vCoeff, 1), (vector signed short)vReadArr[1]); + vRsltEven[2] = vec_mule(vec_splat(vCoeff, 2), (vector signed short)vReadArr[2]); + vRsltEven[3] = vec_mule(vec_splat(vCoeff, 3), (vector signed + short)vReadArr[3]); + + vRsltOdd[0] = vec_mulo(vec_splat(vCoeff, 0), (vector signed short)vReadArr[0]); + vRsltOdd[1] = vec_mulo(vec_splat(vCoeff, 1), (vector signed short)vReadArr[1]); + vRsltOdd[2] = vec_mulo(vec_splat(vCoeff, 2), (vector signed short)vReadArr[2]); + vRsltOdd[3] = vec_mulo(vec_splat(vCoeff, 3), (vector signed + short)vReadArr[3]); + + // as convention, always save to lower-numbered of any pair during addition + vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[1]); + vRsltEven[2] = vec_add(vRsltEven[2], vRsltEven[3]); + + vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[1]); + vRsltOdd[2] = vec_add(vRsltOdd[2], vRsltOdd[3]); + + // the 2 elements below now contain the final mult-sum of 8 elements + vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[2]); + vRsltOdd[0] = 
vec_add(vRsltOdd[0], vRsltOdd[2]); + } + else + { + vRsltEven[0] = vec_mule(vec_splat(vCoeff, 0), (vector signed short)vReadArr[0]); + vRsltEven[1] = vec_mule(vec_splat(vCoeff, 1), (vector signed short)vReadArr[1]); + vRsltEven[2] = vec_mule(vec_splat(vCoeff, 2), (vector signed short)vReadArr[2]); + vRsltEven[3] = vec_mule(vec_splat(vCoeff, 3), (vector signed short)vReadArr[3]); + vRsltEven[4] = vec_mule(vec_splat(vCoeff, 4), (vector signed short)vReadArr[4]); + vRsltEven[5] = vec_mule(vec_splat(vCoeff, 5), (vector signed short)vReadArr[5]); + vRsltEven[6] = vec_mule(vec_splat(vCoeff, 6), (vector signed short)vReadArr[6]); + vRsltEven[7] = vec_mule(vec_splat(vCoeff, 7), (vector signed + short)vReadArr[7]); + + vRsltOdd[0] = vec_mulo(vec_splat(vCoeff, 0), (vector signed short)vReadArr[0]); + vRsltOdd[1] = vec_mulo(vec_splat(vCoeff, 1), (vector signed short)vReadArr[1]); + vRsltOdd[2] = vec_mulo(vec_splat(vCoeff, 2), (vector signed short)vReadArr[2]); + vRsltOdd[3] = vec_mulo(vec_splat(vCoeff, 3), (vector signed short)vReadArr[3]); + vRsltOdd[4] = vec_mulo(vec_splat(vCoeff, 4), (vector signed short)vReadArr[4]); + vRsltOdd[5] = vec_mulo(vec_splat(vCoeff, 5), (vector signed short)vReadArr[5]); + vRsltOdd[6] = vec_mulo(vec_splat(vCoeff, 6), (vector signed short)vReadArr[6]); + vRsltOdd[7] = vec_mulo(vec_splat(vCoeff, 7), (vector signed + short)vReadArr[7]); + + // as convention, always save to lower-numbered of any pair during addition + vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[1]); + vRsltEven[2] = vec_add(vRsltEven[2], vRsltEven[3]); + vRsltEven[4] = vec_add(vRsltEven[4], vRsltEven[5]); + vRsltEven[6] = vec_add(vRsltEven[6], vRsltEven[7]); + + vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[1]); + vRsltOdd[2] = vec_add(vRsltOdd[2], vRsltOdd[3]); + vRsltOdd[4] = vec_add(vRsltOdd[4], vRsltOdd[5]); + vRsltOdd[6] = vec_add(vRsltOdd[6], vRsltOdd[7]); + + vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[2]); + vRsltEven[4] = vec_add(vRsltEven[4], vRsltEven[6]); + + 
vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[2]); + vRsltOdd[4] = vec_add(vRsltOdd[4], vRsltOdd[6]); + + // the 2 elements below now contain the final mult-sum of 8 elements + vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[4]); + vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[4]); + } + + vRsltEven[0] = vec_add(vRsltEven[0], vOffset); + vRsltOdd[0] = vec_add(vRsltOdd[0], vOffset); + + vRsltEven[0] = vec_sra(vRsltEven[0], (vector unsigned int)vShift); + vRsltOdd[0] = vec_sra(vRsltOdd[0], (vector unsigned int)vShift); + + *vVal = vec_pack(vRsltEven[0], vRsltOdd[0]); + *vVal = vec_perm(*vVal, *vVal, ((vector unsigned char) + {0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, 0x04, 0x05, +0x0c, 0x0d, 0x06, 0x07, 0x0e,0x0f})); } + +extern inline +void mult_add_p(int N, vector signed char vReadArr[], vector signed +short vCoeff, vector signed int vOffset, vector unsigned int vShift, +vector signed short* vVal, bool initXfer) { + vector signed short vOperand[8]; + vector signed int vRsltEven[8]; + vector signed int vRsltOdd[8]; + + + if(initXfer) + { + vOperand[0] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[0]); + vOperand[1] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[1]); + vOperand[2] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[2]); + vOperand[3] = (vector signed + short)vec_mergeh(vec_splats((signed char)0), vReadArr[3]); + + if(N != 4) + { + vOperand[4] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[4]); + vOperand[5] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[5]); + vOperand[6] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[6]); + vOperand[7] = (vector signed short)vec_mergeh(vec_splats((signed char)0), vReadArr[7]); + } + } + else + { + vOperand[0] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[0]); + vOperand[1] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[1]); + 
vOperand[2] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[2]); + vOperand[3] = (vector signed + short)vec_mergel(vec_splats((signed char)0), vReadArr[3]); + + if(N != 4) + { + vOperand[4] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[4]); + vOperand[5] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[5]); + vOperand[6] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[6]); + vOperand[7] = (vector signed short)vec_mergel(vec_splats((signed char)0), vReadArr[7]); + } + } + + if(N == 4) + { + vRsltEven[0] = vec_mule(vec_splat(vCoeff, 0), vOperand[0]); + vRsltEven[1] = vec_mule(vec_splat(vCoeff, 1), vOperand[1]); + vRsltEven[2] = vec_mule(vec_splat(vCoeff, 2), vOperand[2]); + vRsltEven[3] = vec_mule(vec_splat(vCoeff, 3), vOperand[3]); + + vRsltOdd[0] = vec_mulo(vec_splat(vCoeff, 0), vOperand[0]); + vRsltOdd[1] = vec_mulo(vec_splat(vCoeff, 1), vOperand[1]); + vRsltOdd[2] = vec_mulo(vec_splat(vCoeff, 2), vOperand[2]); + vRsltOdd[3] = vec_mulo(vec_splat(vCoeff, 3), vOperand[3]); + + // as convention, always save to lower-numbered of any pair during addition + vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[1]); + vRsltEven[2] = vec_add(vRsltEven[2], vRsltEven[3]); + + vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[1]); + vRsltOdd[2] = vec_add(vRsltOdd[2], vRsltOdd[3]); + + // the 2 elements below now contain the final mult-sum of 8 elements + vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[2]); + vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[2]); + } + else + { + vRsltEven[0] = vec_mule(vec_splat(vCoeff, 0), vOperand[0]); + vRsltEven[1] = vec_mule(vec_splat(vCoeff, 1), vOperand[1]); + vRsltEven[2] = vec_mule(vec_splat(vCoeff, 2), vOperand[2]); + vRsltEven[3] = vec_mule(vec_splat(vCoeff, 3), vOperand[3]); + vRsltEven[4] = vec_mule(vec_splat(vCoeff, 4), vOperand[4]); + vRsltEven[5] = vec_mule(vec_splat(vCoeff, 5), vOperand[5]); + vRsltEven[6] = vec_mule(vec_splat(vCoeff, 6), vOperand[6]); 
+ vRsltEven[7] = vec_mule(vec_splat(vCoeff, 7), vOperand[7]); + + vRsltOdd[0] = vec_mulo(vec_splat(vCoeff, 0), vOperand[0]); + vRsltOdd[1] = vec_mulo(vec_splat(vCoeff, 1), vOperand[1]); + vRsltOdd[2] = vec_mulo(vec_splat(vCoeff, 2), vOperand[2]); + vRsltOdd[3] = vec_mulo(vec_splat(vCoeff, 3), vOperand[3]); + vRsltOdd[4] = vec_mulo(vec_splat(vCoeff, 4), vOperand[4]); + vRsltOdd[5] = vec_mulo(vec_splat(vCoeff, 5), vOperand[5]); + vRsltOdd[6] = vec_mulo(vec_splat(vCoeff, 6), vOperand[6]); + vRsltOdd[7] = vec_mulo(vec_splat(vCoeff, 7), vOperand[7]); + + // as convention, always save to lower-numbered of any pair during addition + vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[1]); + vRsltEven[2] = vec_add(vRsltEven[2], vRsltEven[3]); + vRsltEven[4] = vec_add(vRsltEven[4], vRsltEven[5]); + vRsltEven[6] = vec_add(vRsltEven[6], vRsltEven[7]); + + vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[1]); + vRsltOdd[2] = vec_add(vRsltOdd[2], vRsltOdd[3]); + vRsltOdd[4] = vec_add(vRsltOdd[4], vRsltOdd[5]); + vRsltOdd[6] = vec_add(vRsltOdd[6], vRsltOdd[7]); + + vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[2]); + vRsltEven[4] = vec_add(vRsltEven[4], vRsltEven[6]); + + vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[2]); + vRsltOdd[4] = vec_add(vRsltOdd[4], vRsltOdd[6]); + + // the 2 elements below now contain the final mult-sum of 8 elements + vRsltEven[0] = vec_add(vRsltEven[0], vRsltEven[4]); + vRsltOdd[0] = vec_add(vRsltOdd[0], vRsltOdd[4]); + } + + vRsltEven[0] = vec_add(vRsltEven[0], vOffset); + vRsltOdd[0] = vec_add(vRsltOdd[0], vOffset); + + vRsltEven[0] = vec_sra(vRsltEven[0], (vector unsigned int)vShift); + vRsltOdd[0] = vec_sra(vRsltOdd[0], (vector unsigned int)vShift); + + *vVal = vec_pack(vRsltEven[0], vRsltOdd[0]); + *vVal = vec_perm(*vVal, *vVal, ((vector unsigned char) + {0x00, 0x01, 0x08, 0x09, 0x02, 0x03, 0x0a, 0x0b, 0x04, 0x05, +0x0c, 0x0d, 0x06, 0x07, 0x0e,0x0f})); } + +extern inline +void compute_vert_sp(int N, vector signed char vReadArr[], vector +signed char* vDst, 
vector signed int vOffset, vector signed short +vCoeff, vector unsigned int vShift, vector unsigned short vMaxVal) { + vector signed short vVal; + vector bool short compare; + + mult_add_s(N, vReadArr, vCoeff, vOffset, vShift, &vVal); + + compare = vec_cmplt(vVal, vec_splats((signed short)0)); + vVal = vec_sel(vVal, vec_splats((signed short)0), compare); + + compare = vec_cmpgt(vVal, (vector signed short)vMaxVal); + vVal = vec_sel(vVal, (vector signed short)vMaxVal, compare); + + *vDst = (vector signed char)vVal; +} + +extern inline +void compute_vert_ss(int N, vector signed char vReadArr[], vector +signed char* vDst, vector signed short vCoeff, vector unsigned int +vShift) { + vector signed short vVal; + + mult_add_s(N, vReadArr, vCoeff, vec_splats((signed int)0), vShift, + &vVal); + + *vDst = (vector signed char)vVal; +} + +extern inline +void store_value(int dstByteCnt, signed char* dstAddr, vector signed +char vDst, vector signed char vMask) { + signed char* addrHi = dstAddr; + signed char* addrLo = (signed char*)((unsigned long)dstAddr + +dstByteCnt - 1); + + vector unsigned char vPerm = vec_lvsr(0, addrHi); + vector signed char vHi = vec_ld(0, addrHi); + vector signed char vLo = vec_splats((signed char)0); + + vDst = vec_perm(vDst, vDst, vPerm); + vMask = vec_perm(vec_splats((signed char)0), vMask, vPerm); + vHi = vec_sel(vHi, vDst, (vector unsigned char)vMask); + vec_st(vHi, 0, addrHi); + + if(((unsigned long)addrHi & ~(0x0Ful)) != ((unsigned long)addrLo & ~(0x0Ful))) + { + vLo = vec_ld(0, addrLo); + vLo = vec_sel(vDst, vLo, (vector unsigned char)vMask); + vec_st(vLo, 0, addrLo); + } +} _______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel _______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
