# HG changeset patch # User Pooja Venkatesan <po...@multicorewareinc.com> # Date 1566813889 -19800 # Mon Aug 26 15:34:49 2019 +0530 # Node ID 96739081845ad6186aee54c61f69de1f30abff33 # Parent 00b686782ad0c60dff48a1a001417814ce9a20dd Scale pictures using bicubic or bilinear algorithm.
/* Copy the first `size` 64-bit coefficients from `filter` into `filter2`.
 * Nothing is written when `size` is zero or negative. The two ranges are
 * walked front to back, one element at a time. */
inline static void filter_copy_c(int64_t* filter, int64_t* filter2, int size)
{
    const int64_t* in = filter;
    int64_t* out = filter2;

    for (int remaining = size; remaining > 0; remaining--)
        *out++ = *in++;
}
/* Saturate a signed integer to the unsigned 8-bit range.
 *
 * Returns 0 for negative input, 255 for input above 255, and the value
 * itself otherwise. Written with explicit comparisons so the result does not
 * depend on how the compiler right-shifts negative values, and so that
 * INT_MIN (whose negation overflows) is handled like any other negative. */
static uint8_t clipUint8(int a)
{
    if (a < 0)
        return 0;
    if (a > 0xFF)
        return 0xFF;
    return (uint8_t)a;
}
for (int i = 0; i < sliceHor; ++i) + { + m_hFilterScaler->doScaling((int16_t*)dst[destPos + i], dstW, (const uint8_t *)src[sourcePos + i], m_filt, m_filtPos, m_filtLen); + m_destSlice->m_plane[0].sliceHor += 1; + } +} + +void ScalerHCrFilter::process(int sliceVer, int sliceHor) +{ + uint8_t ** src1 = m_sourceSlice->m_plane[1].lineBuf; + uint8_t ** dst1 = m_destSlice->m_plane[1].lineBuf; + uint8_t ** src2 = m_sourceSlice->m_plane[2].lineBuf; + uint8_t ** dst2 = m_destSlice->m_plane[2].lineBuf; + + int sourcePos1 = sliceVer - m_sourceSlice->m_plane[1].sliceVer; + int destPos1 = sliceVer - m_destSlice->m_plane[1].sliceVer; + int sourcePos2 = sliceVer - m_sourceSlice->m_plane[2].sliceVer; + int destPos2 = sliceVer - m_destSlice->m_plane[2].sliceVer; + + int dstW = m_destSlice->m_width >> m_destSlice->m_hCrSubSample; + + for (int i = 0; i < sliceHor; ++i) + { + m_hFilterScaler->doScaling((int16_t*)dst1[destPos1 + i], dstW, src1[sourcePos1 + i], m_filt, m_filtPos, m_filtLen); + m_hFilterScaler->doScaling((int16_t*)dst2[destPos2 + i], dstW, src2[sourcePos2 + i], m_filt, m_filtPos, m_filtLen); + m_destSlice->m_plane[1].sliceHor += 1; + m_destSlice->m_plane[2].sliceHor += 1; + } +} + +void VFilterScaler8Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW) +{ + int IdxW = FACTOR_4; + int IdxF = FIL_DEF; + + (dstW % 4 == 0) && (filterSize == 6) && (IdxF = FIL_6) && (IdxW = FACTOR_4); + (dstW % 4 == 0) && (filterSize == 8) && (IdxF = FIL_8) && (IdxW = FACTOR_4); + +#if X265_DEPTH == 8 + yuv2PlaneX_c(filter, filterSize, src, dest, dstW); +#else + yuv2PlaneX_c_h(filter, filterSize, src, dest, dstW); +#endif +} + +void VFilterScaler10Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW) +{ + int IdxW = FACTOR_4; + int IdxF = FIL_DEF; + + (dstW % 4 == 0) && (filterSize == 6) && (IdxF = FIL_6) && (IdxW = FACTOR_4); + (dstW % 4 == 0) && (filterSize == 8) && (IdxF = FIL_8) && (IdxW = 
FACTOR_4); + +#if X265_DEPTH == 8 + yuv2PlaneX_c(filter, filterSize, src, dest, dstW); +#else + yuv2PlaneX_c_h(filter, filterSize, src, dest, dstW); +#endif +} + +void ScalerVLumFilter::process(int sliceVer, int sliceHor) +{ + (void)sliceHor; + int first = X265_MAX(1 - m_filtLen, m_filtPos[sliceVer]); + int sp = first - m_sourceSlice->m_plane[0].sliceVer; + int dp = sliceVer - m_destSlice->m_plane[0].sliceVer; + uint8_t **src = m_sourceSlice->m_plane[0].lineBuf + sp; + uint8_t **dst = m_destSlice->m_plane[0].lineBuf + dp; + int16_t *filter = m_filt + (sliceVer * m_filtLen); + int dstW = m_destSlice->m_width; + m_vFilterScaler->yuv2PlaneX(filter, m_filtLen, (const int16_t**)src, dst[0], dstW); +} + +void ScalerVCrFilter::process(int sliceVer, int sliceHor) +{ + (void)sliceHor; + + const int crSkipMask = (1 << m_destSlice->m_vCrSubSample) - 1; + if (sliceVer & crSkipMask) + return; + else + { + int dstW = m_destSlice->m_width >> m_destSlice->m_hCrSubSample; + int crSliceVer = sliceVer >> m_destSlice->m_vCrSubSample; + int first = X265_MAX(1 - m_filtLen, m_filtPos[crSliceVer]); + int sp1 = first - m_sourceSlice->m_plane[1].sliceVer; + int sp2 = first - m_sourceSlice->m_plane[2].sliceVer; + int dp1 = crSliceVer - m_destSlice->m_plane[1].sliceVer; + int dp2 = crSliceVer - m_destSlice->m_plane[2].sliceVer; + uint8_t **src1 = m_sourceSlice->m_plane[1].lineBuf + sp1; + uint8_t **src2 = m_sourceSlice->m_plane[2].lineBuf + sp2; + uint8_t **dst1 = m_destSlice->m_plane[1].lineBuf + dp1; + uint8_t **dst2 = m_destSlice->m_plane[2].lineBuf + dp2; + int16_t *filter = m_filt + (crSliceVer * m_filtLen); + + m_vFilterScaler->yuv2PlaneX((int16_t*)filter, m_filtLen, (const int16_t**)src1, dst1[0], dstW); + m_vFilterScaler->yuv2PlaneX((int16_t*)filter, m_filtLen, (const int16_t**)src2, dst2[0], dstW); + } +} + +int ScalerFilter::initCoeff(int flag, int inc, int srcW, int dstW, int filtAlign, int one, int sourcePos, int destPos) +{ + int filterSize; + int filter2Size; + int 
minFilterSize; + int64_t *filter = NULL; + int64_t *filter2 = NULL; + const int64_t fone = 1LL << (54 - x265_min((int)X265_LOG2(srcW / dstW), 8)); + int *outFilterSize = &m_filtLen; + int64_t xDstInSrc; + int sizeFactor = flag; + + // Init filter pos, the +3 is for the MMX(+1) / SSE(+3) scaler which reads over the end + m_filtPos = new int32_t[dstW + 3]; + int32_t **filterPos = &m_filtPos; + + if (inc <= 1 << 16) + filterSize = 1 + sizeFactor; // upscale + else + filterSize = 1 + (sizeFactor * srcW + dstW - 1) / dstW; + + filterSize = x265_min(filterSize, srcW - 2); + filterSize = x265_max(filterSize, 1); + filter = new int64_t[dstW * sizeof(*filter) * filterSize]; + + xDstInSrc = ((destPos*(int64_t)inc) >> 7) - ((sourcePos * 0x10000LL) >> 7); + for (int i = 0; i < dstW; i++) + { + int xx = (xDstInSrc - (filterSize - 2) * (1LL << 16)) / (1 << 17); + (*filterPos)[i] = xx; + for (int j = 0; j < filterSize; j++) + { + int64_t d = (X265_ABS(((int64_t)xx * (1 << 17)) - xDstInSrc)) << 13; + int64_t coeff = 0; + + if (inc > 1 << 16) + d = d * dstW / srcW; + + if (flag == 4) // BiCUBIC + { + int64_t B = (0) * (1 << 24); + int64_t C = (0.6) * (1 << 24); + + if (d >= 1LL << 31) + coeff = 0.0; + else + { + int64_t dd = (d * d) >> 30; + int64_t ddd = (dd * d) >> 30; + + if (d < 1LL << 30) + coeff = (12 * (1 << 24) - 9 * B - 6 * C) * ddd + (-18 * (1 << 24) + 12 * B + 6 * C) * dd + (6 * (1 << 24) - 2 * B) * (1 << 30); + else + coeff = (-B - 6 * C) * ddd + (6 * B + 30 * C) * dd + (-12 * B - 48 * C) * d + (8 * B + 24 * C) * (1 << 30); + } + coeff /= (1LL << 54) / fone; + } + else if (flag == 1) // BILINEAR + { + coeff = (1 << 30) - d; + if (coeff < 0) + coeff = 0; + coeff *= fone >> 30; + } + else + assert(0); + + filter[i * filterSize + j] = coeff; + xx++; + } + xDstInSrc += 2 * inc; + } + + //apply src & dst Filter to filter -> filter2 + X265_CHECK(filterSize > 0, "invalid filterSize value.\n"); + filter2Size = filterSize; + filter2 = new int64_t[dstW * sizeof(*filter2) * 
filter2Size]; + + /* This is hard to read code, but much faster. Speed is crucial here */ + int index = RES_FACTOR_DEF; + int size = dstW * filterSize; + + (size % 4 == 0) && (index = RES_FACTOR_4); + (size % 8 == 0) && (index = RES_FACTOR_8); + (size % 16 == 0) && (index = RES_FACTOR_16); + (size % 32 == 0) && (index = RES_FACTOR_32); + (size % 64 == 0) && (index = RES_FACTOR_64); + + filter_copy_c(filter, filter2, size); + + delete[](filter); + + // try to reduce the filter-size (step1 find size and shift left) + // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not). + minFilterSize = 0; + for (int i = dstW - 1; i >= 0; i--) + { + int min = filter2Size; + int64_t cutOff = 0.0; + + // get rid of near zero elements on the left by shifting left + for (int j = 0; j < filter2Size; j++) + { + int k; + cutOff += X265_ABS(filter2[i * filter2Size]); + + if (cutOff > SCALER_MAX_REDUCE_CUTOFF * fone) + break; + // preserve monotonicity because the core can't handle the filter otherwise + if (i < dstW - 1 && (*filterPos)[i] >= (*filterPos)[i + 1]) + break; + + // move filter coefficients left + for (k = 1; k < filter2Size; k++) + filter2[i * filter2Size + k - 1] = filter2[i * filter2Size + k]; + filter2[i * filter2Size + k - 1] = 0; + (*filterPos)[i]++; + } + + cutOff = 0; + // count near zeros on the right + for (int j = filter2Size - 1; j > 0; j--) + { + cutOff += X265_ABS(filter2[i * filter2Size + j]); + + if (cutOff > SCALER_MAX_REDUCE_CUTOFF * fone) + break; + min--; + } + + if (min > minFilterSize) + minFilterSize = min; + } + + X265_CHECK(minFilterSize > 0, "invalid minFilterSize value.\n"); + filterSize = (minFilterSize + (filtAlign - 1)) & (~(filtAlign - 1)); + X265_CHECK(filterSize > 0, "invalid filterSize value.\n"); + filter = new int64_t[dstW*filterSize * sizeof(*filter)]; + + *outFilterSize = filterSize; + + // try to reduce the filter-size (step2 reduce it) + for (int i = 0; i < dstW; i++) + { + for (int j = 0; j < filterSize; j++) + { + if 
(j >= filter2Size) + filter[i * filterSize + j] = 0; + else + filter[i * filterSize + j] = filter2[i * filter2Size + j]; + if ((flag & SCALER_BITEXACT) && j >= minFilterSize) + filter[i * filterSize + j] = 0; + } + } + + // fix borders + for (int i = 0; i < dstW; i++) + { + int j; + if ((*filterPos)[i] < 0) + { + // move filter coefficients left to compensate for filterPos + for (j = 1; j < filterSize; j++) + { + int left = x265_max(j + (*filterPos)[i], 0); + filter[i * filterSize + left] += filter[i * filterSize + j]; + filter[i * filterSize + j] = 0; + } + (*filterPos)[i] = 0; + } + + if ((*filterPos)[i] + filterSize > srcW) + { + int shift = (*filterPos)[i] + x265_min(filterSize - srcW, 0); + int64_t acc = 0; + + for (j = filterSize - 1; j >= 0; j--) + { + if ((*filterPos)[i] + j >= srcW) + { + acc += filter[i * filterSize + j]; + filter[i * filterSize + j] = 0; + } + } + for (j = filterSize - 1; j >= 0; j--) + { + if (j < shift) + filter[i * filterSize + j] = 0; + else + filter[i * filterSize + j] = filter[i * filterSize + j - shift]; + } + + (*filterPos)[i] -= shift; + filter[i * filterSize + srcW - 1 - (*filterPos)[i]] += acc; + } + + X265_CHECK((*filterPos)[i] >= 0, "invalid: Value of (*filterPos)[%d] < 0.\n", i); + X265_CHECK((*filterPos)[i] < srcW, "invalid: Value of (*filterPos)[%d] > %d .\n", i, srcW); + if ((*filterPos)[i] + filterSize > srcW) + { + for (j = 0; j < filterSize; j++) + { + X265_CHECK(!filter[i * filterSize + j], "invalid: Value of filter[%d * filterSize + %d] != 0.\n", i, j); + X265_CHECK((*filterPos)[i] + j < srcW, "invalid: (*filterPos)[%d] + %d > %d .\n", i, i, srcW); + } + } + } + + // init filter + m_filt = new int16_t[(dstW + 3)*(*outFilterSize)]; + int16_t **outFilter = &m_filt; + + // normalize & store in outFilter + for (int i = 0; i < dstW; i++) + { + int64_t error = 0; + int64_t sum = 0; + + for (int j = 0; j < filterSize; j++) + sum += filter[i * filterSize + j]; + sum = (sum + one / 2) / one; + if (!sum) + { + x265_log(NULL, 
X265_LOG_WARNING, "Scaler: zero vector in scaling\n"); + sum = 1; + } + for (int j = 0; j < *outFilterSize; j++) + { + int64_t v = filter[i * filterSize + j] + error; + int intV = ROUNDED_DIVISION(v, sum); + (*outFilter)[i * (*outFilterSize) + j] = intV; + error = v - intV * sum; + } + } + + (*filterPos)[dstW + 0] = + (*filterPos)[dstW + 1] = + (*filterPos)[dstW + 2] = (*filterPos)[dstW - 1]; + for (int i = 0; i < *outFilterSize; i++) + { + int k = (dstW - 1) * (*outFilterSize) + i; + (*outFilter)[k + 1 * (*outFilterSize)] = + (*outFilter)[k + 2 * (*outFilterSize)] = + (*outFilter)[k + 3 * (*outFilterSize)] = (*outFilter)[k]; + } + + delete[](filter); + delete[](filter2); + return 0; +} + +int ScalerFilterManager::init(int algorithmFlags, VideoDesc *srcVideoDesc, VideoDesc *dstVideoDesc) +{ + int srcW = m_srcW = srcVideoDesc->m_width; + int srcH = m_srcH = srcVideoDesc->m_height; + int dstW = m_dstW = dstVideoDesc->m_width; + int dstH = m_dstH = dstVideoDesc->m_height; + int lumXInc, crXInc; + int lumYInc, crYInc; + int srcHCrPos; + int dstHCrPos; + int srcVCrPos; + int dstVCrPos; + int dst_stride = SCALER_ALIGN(dstW * sizeof(int16_t) + 66, 16); + m_bitDepth = dstVideoDesc->m_inputDepth; + if (m_bitDepth == 16) + dst_stride <<= 1; + + m_algorithmFlags = algorithmFlags; + lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW; + lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH; + + srcHCrPos = -513; + dstHCrPos = -513; + srcVCrPos = -513; + dstVCrPos = -513; + + int srcCsp = srcVideoDesc->m_csp; + if (x265_cli_csps[srcCsp].planes > 1) + { + m_crSrcHSubSample = x265_cli_csps[srcCsp].width[1]; + m_crSrcVSubSample = x265_cli_csps[srcCsp].height[1]; + m_crSrcW = srcVideoDesc->m_width >> m_crSrcHSubSample; + m_crSrcH = srcVideoDesc->m_height >> m_crSrcVSubSample; + if (srcCsp == 1)// i420 + srcVCrPos = 128; + } + else + { + m_crSrcW = 0; + m_crSrcH = 0; + m_crSrcHSubSample = 0; + m_crSrcVSubSample = 0; + } + int dstCsp = dstVideoDesc->m_csp; + if 
(x265_cli_csps[dstCsp].planes > 1) + { + m_crDstHSubSample = x265_cli_csps[dstCsp].width[1]; + m_crDstVSubSample = x265_cli_csps[dstCsp].height[1]; + m_crDstW = dstVideoDesc->m_width >> m_crDstHSubSample; + m_crDstH = dstVideoDesc->m_height >> m_crDstVSubSample; + if (dstCsp == 1)// i420 + dstVCrPos = 128; + } + else + { + m_crDstW = 0; + m_crDstH = 0; + m_crDstHSubSample = 0; + m_crDstVSubSample = 0; + } + // Only srcCsp == dstCsp is supported at present + if (srcCsp != dstCsp) + { + x265_log(NULL, X265_LOG_ERROR, "wrong, source csp != destination csp \n"); + return false; + } + + lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW; + lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH; + crXInc = (((int64_t)m_crSrcW << 16) + (m_crDstW >> 1)) / m_crDstW; + crYInc = (((int64_t)m_crSrcH << 16) + (m_crDstH >> 1)) / m_crDstH; + + const int filterAlign = 1; + + // init horizontal Luma Scaler filter + m_ScalerFilters[0] = new ScalerHLumFilter(m_bitDepth); + m_ScalerFilters[0]->initCoeff(m_algorithmFlags, lumXInc, srcW, dstW, filterAlign, 1 << 14, getLocalPos(0, 0), getLocalPos(0, 0)); + + // init horizontal cr Scaler filter + m_ScalerFilters[1] = new ScalerHCrFilter(m_bitDepth); + m_ScalerFilters[1]->initCoeff(m_algorithmFlags, crXInc, m_crSrcW, m_crDstW, filterAlign, 1 << 14, + getLocalPos(m_crSrcHSubSample, srcHCrPos), getLocalPos(m_crDstHSubSample, dstHCrPos)); + + // init vertical Luma scaler filter + m_ScalerFilters[2] = new ScalerVLumFilter(m_bitDepth); + m_ScalerFilters[2]->initCoeff(m_algorithmFlags, lumYInc, srcH, dstH, filterAlign, 1 << 12, getLocalPos(0, 0), getLocalPos(0, 0)); + + // init vertical cr scaler filter + m_ScalerFilters[3] = new ScalerVCrFilter(m_bitDepth); + m_ScalerFilters[3]->initCoeff(m_algorithmFlags, crYInc, m_crSrcH, m_crDstH, filterAlign, 1 << 12, + getLocalPos(m_crSrcVSubSample, srcVCrPos), getLocalPos(m_crDstVSubSample, dstVCrPos)); + + // init slice, must after filter initialization + initScalerSlice(); + + // set slice + 
m_ScalerFilters[0]->setSlice(m_slices[0], m_slices[1]); + m_ScalerFilters[1]->setSlice(m_slices[0], m_slices[1]); + + m_ScalerFilters[2]->setSlice(m_slices[1], m_slices[2]); + m_ScalerFilters[3]->setSlice(m_slices[1], m_slices[2]); + + return 0; +} + +void HFilterScaler8Bit::doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize) +{ + int IdxW = FACTOR_4; + int IdxF = FIL_DEF; + + /* This is hard to read code, but much faster. Speed is crucial here */ + (dstW % 8 == 0) && (filterSize == 6) && (IdxF = FIL_6) && (IdxW = FACTOR_8); + (dstW % 8 == 0) && (filterSize == 8) && (IdxF = FIL_8) && (IdxW = FACTOR_8); + (dstW % 8 == 0) && (filterSize == 16) && (IdxF = FIL_16) && (IdxW = FACTOR_8); + (dstW % 8 == 0) && (filterSize == 11) && (IdxF = FIL_11) && (IdxW = FACTOR_8); + (dstW % 8 == 0) && (filterSize == 10) && (IdxF = FIL_10) && (IdxW = FACTOR_8); + (dstW % 8 == 0) && (filterSize == 9) && (IdxF = FIL_9) && (IdxW = FACTOR_8); + (dstW % 8 == 0) && (filterSize == 15) && (IdxF = FIL_15) && (IdxW = FACTOR_8); + (dstW % 8 == 0) && (filterSize == 13) && (IdxF = FIL_13) && (IdxW = FACTOR_8); + + /* Do not check multiple of width 4, if width is already multiple of 8 */ + !(dstW % 8 == 0) && (dstW % 4 == 0) && (filterSize == 6) && (IdxF = FIL_6) && (IdxW = FACTOR_4); + !(dstW % 8 == 0) && (dstW % 4 == 0) && (filterSize == 8) && (IdxF = FIL_8) && (IdxW = FACTOR_4); + !(dstW % 8 == 0) && (dstW % 4 == 0) && (filterSize == 16) && (IdxF = FIL_16) && (IdxW = FACTOR_4); + + (dstW % 4 == 0) && (filterSize == 24) && (IdxF = FIL_24) && (IdxW = FACTOR_4); + (dstW % 4 == 0) && (filterSize == 22) && (IdxF = FIL_22) && (IdxW = FACTOR_4); + (dstW % 4 == 0) && (filterSize == 19) && (IdxF = FIL_19) && (IdxW = FACTOR_4); + (dstW % 4 == 0) && (filterSize == 17) && (IdxF = FIL_17) && (IdxW = FACTOR_4); + +#if X265_DEPTH == 8 + doScaling_c(dst, dstW, src, filter, filterPos, filterSize); +#else + doScaling_c_h(dst, dstW, src, filter, 
/* Scale one complete picture.
 *
 * src/dst              Arrays of plane pointers for the input and output
 *                      picture. They are handed to ScalerSlice::initFromSrc,
 *                      which reads entries 0..3 of both the pointer and the
 *                      stride arrays.
 *                      NOTE(review): confirm every caller passes 4-entry arrays.
 * srcStride/dstStride  Per-plane strides matching src/dst.
 *
 * Returns -1 when either plane array is null, otherwise 0.
 *
 * For every destination row the loop works out which source rows the
 * vertical filters need, runs the horizontal filters (m_ScalerFilters[0] and
 * [1]) to bring those rows into the intermediate slice m_slices[1], then runs
 * the vertical filters (index 2 onwards) to emit the row into m_slices[2],
 * which was pointed at the caller's dst planes. */
int ScalerFilterManager::scale_pic(void ** src, void ** dst, int * srcStride, int * dstStride)
{
    uint8_t** src_8bit, **dst_8bit;
    src_8bit = (uint8_t**)src;
    dst_8bit = (uint8_t**)dst;
    if (!src_8bit || !dst_8bit)
        return -1;

    // The whole source picture is treated as a single slice of m_srcH rows.
    const int srcsliceHor = m_srcH;
    const int dstW = m_dstW;
    const int dstH = m_dstH;
    int32_t *vLumFilterPos = m_ScalerFilters[2]->m_filtPos;
    int32_t *vCrFilterPos = m_ScalerFilters[3]->m_filtPos;
    const int vLumFilterSize = m_ScalerFilters[2]->m_filtLen;
    const int vCrFilterSize = m_ScalerFilters[3]->m_filtLen;
    // chroma row count of the source slice, rounded up
    const int crSrcsliceHor = UH_CEIL_SHIFTR(srcsliceHor, m_crSrcVSubSample);

    // vars which will change and which we need to store back in the context
    // (lumBufIndex / crBufIndex are only updated and wrapped below; nothing
    // in this function reads them to make a decision)
    int lumBufIndex = -1;
    int crBufIndex = -1;
    int lastInLumBuf = -1;   // last source luma row already horizontally scaled
    int lastInCrBuf = -1;    // last source chroma row already horizontally scaled

    int hasLumHoles = 1;
    int hasCrHoles = 1;

    ScalerSlice *src_slice = m_slices[0];
    ScalerSlice *hout_slice = m_slices[1];
    ScalerSlice *vout_slice = m_slices[2];
    // Point the input and output slices at the caller's planes.
    src_slice->initFromSrc((uint8_t**)src, srcStride, m_srcW, 0, srcsliceHor, 0, crSrcsliceHor, 1);
    vout_slice->initFromSrc((uint8_t**)dst, dstStride, m_dstW, 0, dstH, 0, UH_CEIL_SHIFTR(dstH, m_crDstVSubSample), 0);

    // Reset the intermediate slice: it starts at row 0 and holds no rows yet.
    hout_slice->m_plane[0].sliceVer = 0;
    hout_slice->m_plane[1].sliceVer = 0;
    hout_slice->m_plane[2].sliceVer = 0;
    hout_slice->m_plane[3].sliceVer = 0;
    hout_slice->m_plane[0].sliceHor = 0;
    hout_slice->m_plane[1].sliceHor = 0;
    hout_slice->m_plane[2].sliceHor = 0;
    hout_slice->m_plane[3].sliceHor = 0;
    hout_slice->m_width = dstW;

    for (int dstY = 0; dstY < dstH; dstY++)
    {
        const int crDstY = dstY >> m_crDstVSubSample;
        // First source row each vertical filter touches for this output row.
        const int firstLumSrcY = x265_max(1 - vLumFilterSize, vLumFilterPos[dstY]);
        // Same, but for the last luma row that shares this chroma row
        // (clamped to the picture), used only for the enoughLines test.
        const int firstLumSrcY2 = x265_max(1 - vLumFilterSize, vLumFilterPos[x265_min(dstY | ((1 << m_crDstVSubSample) - 1), dstH - 1)]);
        const int firstCrSrcY = x265_max(1 - vCrFilterSize, vCrFilterPos[crDstY]);

        // Last source row needed, clamped to the picture height.
        int lastLumSrcY = x265_min(m_srcH, firstLumSrcY + vLumFilterSize) - 1;
        int lastLumSrcY2 = x265_min(m_srcH, firstLumSrcY2 + vLumFilterSize) - 1;
        int lastCrSrcY = x265_min(m_crSrcH, firstCrSrcY + vCrFilterSize) - 1;

        // handle holes
        // A "hole" means the first row needed is not the row right after the
        // last one buffered; the intermediate slice is then restarted at the
        // new first row.
        if (firstLumSrcY > lastInLumBuf)
        {
            hasLumHoles = lastInLumBuf != firstLumSrcY - 1;
            if (hasLumHoles)
            {
                hout_slice->m_plane[0].sliceVer = firstLumSrcY;
                hout_slice->m_plane[3].sliceVer = firstLumSrcY;
                hout_slice->m_plane[0].sliceHor =
                hout_slice->m_plane[3].sliceHor = 0;
            }

            lastInLumBuf = firstLumSrcY - 1;
        }
        if (firstCrSrcY > lastInCrBuf)
        {
            hasCrHoles = lastInCrBuf != firstCrSrcY - 1;
            if (hasCrHoles)
            {
                hout_slice->m_plane[1].sliceVer = firstCrSrcY;
                hout_slice->m_plane[2].sliceVer = firstCrSrcY;
                hout_slice->m_plane[1].sliceHor =
                hout_slice->m_plane[2].sliceHor = 0;
            }

            lastInCrBuf = firstCrSrcY - 1;
        }

        // Do we have enough lines in this slice to output the dstY line
        int enoughLines = lastLumSrcY2 < 0 + srcsliceHor && lastCrSrcY < UH_CEIL_SHIFTR(0 + srcsliceHor, m_crSrcVSubSample);
        if (!enoughLines)
        {
            // Not enough input: buffer up to the end of the slice, then stop
            // (see the break further down).
            lastLumSrcY = 0 + srcsliceHor - 1;
            lastCrSrcY = 0 + crSrcsliceHor - 1;
            x265_log(NULL, X265_LOG_INFO, "buffering slice: lastLumSrcY %d lastCrSrcY %d\n", lastLumSrcY, lastCrSrcY);
        }

        // The rows needed for this output row must fit in the intermediate slice.
        X265_CHECK(((lastLumSrcY - firstLumSrcY + 1) <= hout_slice->m_plane[0].availLines), "invalid value %d", lastLumSrcY - firstLumSrcY + 1);
        X265_CHECK((lastCrSrcY - firstCrSrcY + 1) <= hout_slice->m_plane[1].availLines, "invalid value %d", lastCrSrcY - firstCrSrcY + 1);

        int firstPosY, lastPosY, firstCPosY, lastCPosY;
        // posY: first luma row not yet present in the intermediate slice.
        int posY = hout_slice->m_plane[0].sliceVer + hout_slice->m_plane[0].sliceHor;
        if (posY <= lastLumSrcY && !hasLumHoles)
        {
            // Contiguous case: scale ahead, up to the slice capacity or the
            // end of the source, whichever comes first.
            firstPosY = x265_max(firstLumSrcY, posY);
            lastPosY = x265_min(firstLumSrcY + hout_slice->m_plane[0].availLines - 1, 0 + srcsliceHor - 1);
        }
        else
        {
            firstPosY = posY;
            lastPosY = lastLumSrcY;
        }

        // Same computation for the chroma planes.
        int cPosY = hout_slice->m_plane[1].sliceVer + hout_slice->m_plane[1].sliceHor;
        if (cPosY <= lastCrSrcY && !hasCrHoles)
        {
            firstCPosY = x265_max(firstCrSrcY, cPosY);
            lastCPosY = x265_min(firstCrSrcY + hout_slice->m_plane[1].availLines - 1, UH_CEIL_SHIFTR(0 + srcsliceHor, m_crSrcVSubSample) - 1);
        }
        else
        {
            firstCPosY = cPosY;
            lastCPosY = lastCrSrcY;
        }

        // Advance the intermediate slice window before writing new rows.
        hout_slice->rotate(lastPosY, lastCPosY);
        // horizontal luma scale
        if (posY < lastLumSrcY + 1)
            m_ScalerFilters[0]->process(firstPosY, lastPosY - firstPosY + 1);

        lumBufIndex += lastLumSrcY - lastInLumBuf;
        lastInLumBuf = lastLumSrcY;
        // horizontal chroma Scale
        if (cPosY < lastCrSrcY + 1)
            m_ScalerFilters[1]->process(firstCPosY, lastCPosY - firstCPosY + 1);

        crBufIndex += lastCrSrcY - lastInCrBuf;
        lastInCrBuf = lastCrSrcY;

        // wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumFilterSize)
            lumBufIndex -= vLumFilterSize;
        if (crBufIndex >= vCrFilterSize)
            crBufIndex -= vCrFilterSize;
        if (!enoughLines)
            break; // we can't output a dstY line so let's try with the next slice

        // vertical scale(output converter)
        // Filters 2.. are the vertical filters; each emits output row dstY.
        for (int i = 2; i < m_numFilter; ++i)
            m_ScalerFilters[i]->process(dstY, 1);
    }
    return 0;
}
66, 16); + if (m_bitDepth == 16) + dst_stride <<= 1; + + int lumBufSize; + int crBufSize; + int vLumFilterSize = m_ScalerFilters[2]->m_filtLen; // Vertical filter size for luma pixels. + int vCrFilterSize = m_ScalerFilters[3]->m_filtLen; // Vertical filter size for chroma pixels. + getMinBufferSize(&lumBufSize, &crBufSize); + lumBufSize = X265_MAX(lumBufSize, vLumFilterSize + MAX_NUM_LINES_AHEAD); + crBufSize = X265_MAX(crBufSize, vCrFilterSize + MAX_NUM_LINES_AHEAD); + + for (int i = 0; i < m_numSlice; i++) + m_slices[i] = new ScalerSlice; + ret = m_slices[0]->create(m_srcH, m_crSrcH, m_crSrcHSubSample, m_crSrcVSubSample, 0); + if (ret < 0) + { + x265_log(NULL, X265_LOG_ERROR, "alloc_slice m_slice[0] failed\n"); + return -1; + } + + // horizontal scaler output + ret = m_slices[1]->create(lumBufSize, crBufSize, m_crDstHSubSample, m_crDstVSubSample, 1); + if (ret < 0) + { + x265_log(NULL, X265_LOG_ERROR, "m_slice[1].create failed\n"); + return -1; + } + ret = m_slices[1]->createLines(dst_stride, m_dstW); + if (ret < 0) + { + x265_log(NULL, X265_LOG_ERROR, "m_slice[1].createLines failed\n"); + return -1; + } + + m_slices[1]->fillOnes(dst_stride >> 1, m_bitDepth == 16); + + // vertical scaler output + ret = m_slices[2]->create(m_dstH, m_crDstH, m_crDstHSubSample, m_crDstVSubSample, 0); + if (ret < 0) + { + x265_log(NULL, X265_LOG_ERROR, "m_slice[2].create failed\n"); + return -1; + } + + return 0; +} + +int ScalerFilterManager::getLocalPos(int crSubSample, int pos) +{ + if (pos == -1 || pos <= -513) + pos = (128 << crSubSample) - 128; + pos += 128; // relative to ideal left edge + return pos >> crSubSample; +} + +ScalerSlice::ScalerSlice() : + m_width(0), + m_hCrSubSample(0), + m_vCrSubSample(0), + m_isRing(0), + m_destroyLines(0) +{ + for (int i = 0; i < m_numSlicePlane; i++) + { + m_plane[i].availLines = 0; + m_plane[i].sliceVer = 0; + m_plane[i].sliceHor = 0; + m_plane[i].lineBuf = NULL; + } +} + +void ScalerSlice::destroy() +{ + if (m_destroyLines) + 
destroyLines();
+    for (int i = 0; i < m_numSlicePlane; i++)
+    {
+        if (m_plane[i].lineBuf)
+            X265_FREE(m_plane[i].lineBuf);
+    }
+}
+
+/*
+ * Allocate the per-plane tables of line pointers.
+ *
+ * lumLines / crLines : number of lines the luma (planes 0 and 3) and chroma
+ *                      (planes 1 and 2) tables must be able to hold.
+ * h_sub_sample / v_sub_sample : chroma subsampling factors stored on the slice.
+ * ring : non-zero allocates three times as many pointer slots per plane.
+ *
+ * Returns 0 on success, -1 if a table could not be allocated.
+ */
+int ScalerSlice::create(int lumLines, int crLines, int h_sub_sample, int v_sub_sample, int ring)
+{
+    const int size[4] = { lumLines, crLines, crLines, lumLines };
+
+    m_hCrSubSample = h_sub_sample;
+    m_vCrSubSample = v_sub_sample;
+    m_isRing = ring;
+    m_destroyLines = 0;
+
+    for (int i = 0; i < m_numSlicePlane; ++i)
+    {
+        int n = size[i] * (ring == 0 ? 1 : 3);
+        m_plane[i].lineBuf = X265_MALLOC(uint8_t*, n);
+        if (!m_plane[i].lineBuf)
+            return -1;
+
+        // Start with every slot empty: createLines() calls destroyLines() when an
+        // allocation fails, and destroyLines() frees every slot of the table, so
+        // slots that were never filled must not hold garbage.
+        memset(m_plane[i].lineBuf, 0, sizeof(uint8_t*) * n);
+
+        m_plane[i].availLines = size[i];
+        m_plane[i].sliceVer = 0;
+        m_plane[i].sliceHor = 0;
+    }
+    return 0;
+}
+
+/*
+ * Allocate the line buffers themselves.
+ *
+ * Slice lines contain extra bytes for vectorial code, thus @size is the
+ * allocated memory size and @width is the number of pixels.
+ *
+ * One allocation of (size * 2 + 32) bytes backs a pair of planes: plane 0 with
+ * plane 3, and plane 1 with plane 2. The second plane of each pair points
+ * (size + 16) bytes into the first plane's buffer.
+ *
+ * Returns 0 on success, -1 on allocation failure (lines allocated so far are released).
+ */
+int ScalerSlice::createLines(int size, int width)
+{
+    int i;
+    int idx[2] = { 3, 2 };
+
+    m_destroyLines = 1;
+    m_width = width;
+
+    for (i = 0; i < 2; ++i) {
+        int n = m_plane[i].availLines;
+        int j;
+        int ii = idx[i];
+        assert(n == m_plane[ii].availLines);
+        for (j = 0; j < n; ++j)
+        {
+            // chroma plane line U and V are expected to be contiguous in memory
+            m_plane[i].lineBuf[j] = (uint8_t*)X265_MALLOC(uint8_t, size * 2 + 32);
+            if (!m_plane[i].lineBuf[j])
+            {
+                destroyLines();
+                return -1;
+            }
+            m_plane[ii].lineBuf[j] = m_plane[i].lineBuf[j] + size + 16;
+            if (m_isRing)
+            {
+                // ring buffers expose every line a second time, n slots further on
+                m_plane[i].lineBuf[j + n] = m_plane[i].lineBuf[j];
+                m_plane[ii].lineBuf[j + n] = m_plane[ii].lineBuf[j];
+            }
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Release the line buffers allocated by createLines() and clear all pointer tables.
+ * Only planes 0 and 1 are freed: planes 3 and 2 point into the same allocations.
+ */
+void ScalerSlice::destroyLines()
+{
+    int i;
+    for (i = 0; i < 2; ++i)
+    {
+        int n = m_plane[i].availLines;
+        int j;
+        for (j = 0; j < n; ++j)
+        {
+            X265_FREE(m_plane[i].lineBuf[j]);
+            m_plane[i].lineBuf[j] = NULL;
+            if (m_isRing)
+                m_plane[i].lineBuf[j + n] = NULL;
+        }
+    }
+
+    for (i = 0; i < m_numSlicePlane; ++i)
+        memset(m_plane[i].lineBuf, 0, sizeof(uint8_t*) * m_plane[i].availLines * (m_isRing ? 3 : 1));
+    m_destroyLines = 0;
+}
+
+/*
+ * Fill every available line of every plane with a constant.
+ * 16-bit mode writes (n >> 1) + 1 int32_t values of 1 << 18 per line,
+ * otherwise n + 1 int16_t values of 1 << 14 are written.
+ */
+void ScalerSlice::fillOnes(int n, int is16bit)
+{
+    int i;
+    for (i = 0; i < m_numSlicePlane; ++i)
+    {
+        int j;
+        int size = m_plane[i].availLines;
+        for (j = 0; j < size; ++j)
+        {
+            int k;
+            int end = is16bit ? n >> 1 : n;
+            // fill also one extra element
+            end += 1;
+            if (is16bit)
+                for (k = 0; k < end; ++k)
+                    ((int32_t*)(m_plane[i].lineBuf[j]))[k] = 1 << 18;
+            else
+                for (k = 0; k < end; ++k)
+                    ((int16_t*)(m_plane[i].lineBuf[j]))[k] = 1 << 14;
+        }
+    }
+}
+
+/*
+ * Advance the window of a plane by availLines once the requested line (lum for
+ * planes 0 and 3, cr for planes 1 and 2) lies at least two windows past the
+ * plane's first line. A zero argument leaves the matching planes untouched.
+ * Always returns 0.
+ */
+int ScalerSlice::rotate(int lum, int cr)
+{
+    int i;
+    if (lum)
+    {
+        // step of 3 visits planes 0 and 3
+        for (i = 0; i < m_numSlicePlane; i += 3)
+        {
+            int n = m_plane[i].availLines;
+            int l = lum - m_plane[i].sliceVer;
+
+            if (l >= n * 2)
+            {
+                m_plane[i].sliceVer += n;
+                m_plane[i].sliceHor -= n;
+            }
+        }
+    }
+    if (cr)
+    {
+        for (i = 1; i < 3; ++i)
+        {
+            int n = m_plane[i].availLines;
+            int l = cr - m_plane[i].sliceVer;
+
+            if (l >= n * 2)
+            {
+                m_plane[i].sliceVer += n;
+                m_plane[i].sliceHor -= n;
+            }
+        }
+    }
+    return 0;
+}
+
+/*
+ * Point the slice's line tables at caller-owned picture memory (no copy is made).
+ *
+ * src / stride : plane base pointers and strides; all four entries are read.
+ * srcW         : line width stored on the slice.
+ * lumY / lumH  : first line and line count for planes 0 and 3.
+ * crY / crH    : first line and line count for planes 1 and 2.
+ * relative     : non-zero when src already points at line lumY / crY.
+ *
+ * Always returns 0.
+ */
+int ScalerSlice::initFromSrc(uint8_t *src[4], const int stride[4], int srcW, int lumY, int lumH, int crY, int crH, int relative)
+{
+    int i = 0;
+
+    const int start[m_numSlicePlane] = { lumY, crY, crY, lumY };
+
+    const int end[m_numSlicePlane] = { lumY + lumH, crY + crH, crY + crH, lumY + lumH };
+
+    uint8_t *const src_[m_numSlicePlane] = { src[0] + (relative ? 0 : start[0]) * stride[0],
+                                             src[1] + (relative ? 0 : start[1]) * stride[1],
+                                             src[2] + (relative ? 0 : start[2]) * stride[2],
+                                             src[3] + (relative ? 0 : start[3]) * stride[3] };
+
+    m_width = srcW;
+
+    for (i = 0; i < m_numSlicePlane; ++i)
+    {
+        int j;
+        int first = m_plane[i].sliceVer;
+        int n = m_plane[i].availLines;
+        int lines = end[i] - start[i];
+        int tot_lines = end[i] - first;
+
+        if (start[i] >= first && n >= tot_lines)
+        {
+            // the new lines fit behind the ones already referenced: append them
+            m_plane[i].sliceHor = x265_max(tot_lines, m_plane[i].sliceHor);
+            for (j = 0; j < lines; j += 1)
+                m_plane[i].lineBuf[start[i] - first + j] = src_[i] + j * stride[i];
+        }
+        else
+        {
+            // otherwise restart the window at start[i], clamped to the table size
+            m_plane[i].sliceVer = start[i];
+            lines = lines > n ? n : lines;
+            m_plane[i].sliceHor = lines;
+            for (j = 0; j < lines; j += 1)
+                m_plane[i].lineBuf[j] = src_[i] + j * stride[i];
+        }
+    }
+    return 0;
+}
+}
diff -r 00b686782ad0 -r 96739081845a source/common/scaler.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/scaler.h Mon Aug 26 15:34:49 2019 +0530
@@ -0,0 +1,254 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Pooja Venkatesan <po...@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_SCALER_H
+#define X265_SCALER_H
+
+#include "common.h"
+
+namespace X265_NS {
+//x265 private namespace
+
+class ScalerSlice;
+class VideoDesc;
+
+#define MAX_NUM_LINES_AHEAD 4
+#define SCALER_ALIGN(x, j) (((x)+(j)-1)&~((j)-1))
+#define X265_ABS(j) ((j) >= 0 ? (j) : (-(j)))
+#define SCALER_MAX_REDUCE_CUTOFF 0.002
+#define SCALER_BITEXACT 0x80000
+#define ROUNDED_DIVISION(i,j) (((i)>0 ? (i) + ((j)>>1) : (i) - ((j)>>1))/(j))
+#define UH_CEIL_SHIFTR(i,j) (!scale_builtin_constant_p(j) ? -((-(i)) >> (j)) \
+                                                          : ((i) + (1<<(j)) - 1) >> (j))
+
+#if defined(__GNUC__) || defined(__clang__)
+# define scale_builtin_constant_p __builtin_constant_p
+#else
+# define scale_builtin_constant_p(x) 0
+#endif
+
+enum ResFactor
+{
+    RES_FACTOR_64, RES_FACTOR_32, RES_FACTOR_16, RES_FACTOR_8,
+    RES_FACTOR_4, RES_FACTOR_DEF, NUM_RES_FACTOR
+};
+
+enum ScalerFactor
+{
+    FACTOR_4, FACTOR_8, NUM_FACTOR
+};
+
+enum FilterSize
+{
+    FIL_4, FIL_6, FIL_8, FIL_9, FIL_10, FIL_11, FIL_13, FIL_15,
+    FIL_16, FIL_17, FIL_19, FIL_22, FIL_24, FIL_DEF, NUM_FIL
+};
+
+// Base class of one filtering stage: holds the coefficient tables and the
+// source / destination slices the stage reads from and writes to.
+class ScalerFilter {
+public:
+    int             m_filtLen;
+    int32_t*        m_filtPos;      // Array of horizontal/vertical starting pos for each dst for luma / chroma planes.
+    int16_t*        m_filt;         // Array of horizontal/vertical filter coefficients for luma / chroma planes.
+    ScalerSlice*    m_sourceSlice;  // Source slice
+    ScalerSlice*    m_destSlice;    // Output slice
+    ScalerFilter();
+    virtual ~ScalerFilter();
+    virtual void process(int sliceVer, int sliceHor) = 0;
+    int initCoeff(int flag, int inc, int srcW, int dstW, int filtAlign, int one, int sourcePos, int destPos);
+    void setSlice(ScalerSlice* source, ScalerSlice* dest) { m_sourceSlice = source; m_destSlice = dest; }
+};
+
+// Plain description of a picture: dimensions, colour space and bit depth.
+class VideoDesc {
+public:
+    int         m_width;
+    int         m_height;
+    int         m_csp;
+    int         m_inputDepth;
+
+    VideoDesc(int w, int h, int csp, int bitDepth)
+    {
+        m_width = w;
+        m_height = h;
+        m_csp = csp;
+        m_inputDepth = bitDepth;
+    }
+};
+
+typedef struct ScalerPlane
+{
+    int       availLines; // max number of lines that can be held by this plane
+    int       sliceVer;   // index of first line
+    int       sliceHor;   // number of lines
+    uint8_t** lineBuf;    // line buffer
+} ScalerPlane;
+
+// Assist horizontal filtering, base class
+class HFilterScaler {
+public:
+    int m_bitDepth;
+public:
+    HFilterScaler() :m_bitDepth(0) {};
+    virtual ~HFilterScaler() {};
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize) = 0;
+};
+
+// Assist vertical filtering, base class
+class VFilterScaler {
+public:
+    int m_bitDepth;
+public:
+    VFilterScaler() :m_bitDepth(0) {};
+    virtual ~VFilterScaler() {};
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW) = 0;
+};
+
+// Assist horizontal filtering, process 8 bit case
+class HFilterScaler8Bit : public HFilterScaler {
+public:
+    HFilterScaler8Bit() { m_bitDepth = 8; }
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize);
+};
+
+// Assist horizontal filtering, process 10 bit case
+class HFilterScaler10Bit : public HFilterScaler {
+public:
+    HFilterScaler10Bit() { m_bitDepth = 10; }
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize);
+};
+
+// Assist vertical filtering, process 8 bit case
+class VFilterScaler8Bit : public VFilterScaler {
+public:
+    VFilterScaler8Bit() { m_bitDepth = 8; }
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW);
+};
+
+// Assist vertical filtering, process 10 bit case
+class VFilterScaler10Bit : public VFilterScaler {
+public:
+    VFilterScaler10Bit() { m_bitDepth = 10; }
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW);
+};
+
+// Horizontal filter for luma.
+// The helper is created with new, so it is released with delete; it stays NULL
+// for bit depths other than 8 and 10.
+class ScalerHLumFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler;
+public:
+    ScalerHLumFilter(int bitDepth) : m_hFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_hFilterScaler = new HFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_hFilterScaler = new HFilterScaler10Bit;
+    }
+    ~ScalerHLumFilter() { delete m_hFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Horizontal filter for chroma (same helper ownership as ScalerHLumFilter)
+class ScalerHCrFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler;
+public:
+    ScalerHCrFilter(int bitDepth) : m_hFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_hFilterScaler = new HFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_hFilterScaler = new HFilterScaler10Bit;
+    }
+    ~ScalerHCrFilter() { delete m_hFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for luma (same helper ownership as ScalerHLumFilter)
+class ScalerVLumFilter : public ScalerFilter {
+private:
+    VFilterScaler* m_vFilterScaler;
+public:
+    ScalerVLumFilter(int bitDepth) : m_vFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_vFilterScaler = new VFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_vFilterScaler = new VFilterScaler10Bit;
+    }
+    ~ScalerVLumFilter() { delete m_vFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for chroma (same helper ownership as ScalerHLumFilter)
+class ScalerVCrFilter : public ScalerFilter {
+private:
+    VFilterScaler* m_vFilterScaler;
+public:
+    ScalerVCrFilter(int bitDepth) : m_vFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_vFilterScaler = new VFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_vFilterScaler = new VFilterScaler10Bit;
+    }
+    ~ScalerVCrFilter() { delete m_vFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// A set of four planes, each described by a table of line pointers.
+class ScalerSlice
+{
+private:
+    enum ScalerSlicePlaneNum { m_numSlicePlane = 4 };
+public:
+    int m_width;        // Slice line width
+    int m_hCrSubSample; // horizontal Chroma subsampling factor
+    int m_vCrSubSample; // vertical chroma subsampling factor
+    int m_isRing;       // flag to identify if this ScalerSlice is a ring buffer
+    int m_destroyLines; // flag to identify if there are dynamic allocated lines
+    ScalerPlane m_plane[m_numSlicePlane];
+public:
+    ScalerSlice();
+    ~ScalerSlice() { destroy(); }
+    int rotate(int lum, int cr);
+    void fillOnes(int n, int is16bit);
+    int create(int lumLines, int crLines, int h_sub_sample, int v_sub_sample, int ring);
+    int createLines(int size, int width);
+    void destroyLines();
+    void destroy();
+    int initFromSrc(uint8_t *src[4], const int stride[4], int srcW, int lumY, int lumH, int crY, int crH, int relative);
+};
+
+// Owns the slices and the four filter stages (horizontal luma / chroma,
+// vertical luma / chroma) used to scale one picture.
+class ScalerFilterManager {
+private:
+    enum ScalerFilterNum { m_numSlice = 3, m_numFilter = 4 };
+
+private:
+    int                     m_bitDepth;
+    int                     m_algorithmFlags;  // 1, bilinear; 4 bicubic, default is bicubic
+    int                     m_srcW;            // Width  of source luma planes.
+    int                     m_srcH;            // Height of source luma planes.
+    int                     m_dstW;            // Width of dest luma planes.
+    int                     m_dstH;            // Height of dest luma planes.
+    int                     m_crSrcW;          // Width  of source chroma planes.
+    int                     m_crSrcH;          // Height of source chroma planes.
+    int                     m_crDstW;          // Width  of dest chroma planes.
+    int                     m_crDstH;          // Height of dest chroma planes.
+    int                     m_crSrcHSubSample; // Binary log of horizontal subsampling factor between Y and Cr planes in src image.
+    int                     m_crSrcVSubSample; // Binary log of vertical subsampling factor between Y and Cr planes in src image.
+    int                     m_crDstHSubSample; // Binary log of horizontal subsampling factor between Y and Cr planes in dest image.
+    int                     m_crDstVSubSample; // Binary log of vertical subsampling factor between Y and Cr planes in dest image.
+    ScalerSlice*            m_slices[m_numSlice];
+    ScalerFilter*           m_ScalerFilters[m_numFilter];
+private:
+    int getLocalPos(int crSubSample, int pos);
+    void getMinBufferSize(int *out_lum_size, int *out_cr_size);
+    int initScalerSlice();
+public:
+    ScalerFilterManager();
+    ~ScalerFilterManager() {
+        // ~ScalerSlice() already calls destroy(); calling it here as well would
+        // release the slice's line tables twice.
+        for (int i = 0; i < m_numSlice; i++)
+            if (m_slices[i]) { delete m_slices[i]; m_slices[i] = NULL; }
+        for (int i = 0; i < m_numFilter; i++)
+            if (m_ScalerFilters[i]) { delete m_ScalerFilters[i]; m_ScalerFilters[i] = NULL; }
+    }
+    int init(int algorithmFlags, VideoDesc* srcVideoDesc, VideoDesc* dstVideoDesc);
+    int scale_pic(void** src, void** dst, int* srcStride, int* dstStride);
+};
+}
+
+#endif //ifndef X265_SCALER_H
# HG changeset patch
# User Pooja Venkatesan <po...@multicorewareinc.com>
# Date 1566813889 -19800
# Mon Aug 26 15:34:49 2019 +0530
# Node ID 96739081845ad6186aee54c61f69de1f30abff33
# Parent 00b686782ad0c60dff48a1a001417814ce9a20dd
Scale pictures using bicubic or bilinear algorithm.

diff -r 00b686782ad0 -r 96739081845a source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Thu Feb 13 11:57:33 2020 +0530
+++ b/source/common/CMakeLists.txt Mon Aug 26 15:34:49 2019 +0530
@@ -168,4 +168,5 @@
     predict.cpp predict.h
     scalinglist.cpp scalinglist.h
     quant.cpp quant.h contexts.h
-    deblock.cpp deblock.h)
+    deblock.cpp deblock.h
+    scaler.cpp scaler.h)
diff -r 00b686782ad0 -r 96739081845a source/common/scaler.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/scaler.cpp Mon Aug 26 15:34:49 2019 +0530
@@ -0,0 +1,1087 @@
+#include "scaler.h"
+
+#if _MSC_VER
+#pragma warning(disable: 4706) // assignment within conditional
+#pragma warning(disable: 4244) // '=' : possible loss of data
+#endif
+
+#define SHORT_MIN (-(1 << 15))
+#define SHORT_MAX ((1 << 15) - 1)
+#define SHORT_MAX_10 ((1 << 10) - 1)
+
+namespace X265_NS{
+
+// Start with no geometry, no slices and no filters; init() fills them in.
+ScalerFilterManager::ScalerFilterManager() :
+    m_bitDepth(0),
+    m_algorithmFlags(0),
+    m_srcW(0),
+    m_srcH(0),
+    m_dstW(0),
+    m_dstH(0),
+    m_crSrcW(0),
+    m_crSrcH(0),
+    m_crDstW(0),
+    m_crDstH(0),
+    m_crSrcHSubSample(0),
+    m_crSrcVSubSample(0),
+    m_crDstHSubSample(0),
+    m_crDstVSubSample(0)
+{
+    for (int i = 0; i < m_numSlice; i++)
+        m_slices[i] = NULL;
+    for (int i = 0; i < m_numFilter; i++)
+        m_ScalerFilters[i] = NULL;
+}
+
+// Copy size coefficients from filter to filter2.
+inline static void filter_copy_c(int64_t* filter, int64_t* filter2, int size)
+{
+    for (int i = 0; i < size; i++)
+        filter2[i] = filter[i];
+}
+
+#if X265_DEPTH == 8
+// Horizontal pass over 8-bit samples: each output is the filterSize-tap sum that
+// starts at filterPos[i], shifted right by 7 and clipped to the int16_t range.
+static void doScaling_c(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+    for (int i = 0; i < dstW; i++)
+    {
+        int val = 0;
+        int sourcePos = filterPos[i];
+        for (int j = 0; j < filterSize; j++)
+            val += ((int)src[sourcePos + j]) * filter[filterSize * i + j];
+        // the cubic equation does overflow ...
+        dst[i] = x265_clip3(SHORT_MIN, SHORT_MAX, val >> 7);
+    }
+}
+// Clamp to 0..255: values with bits outside the low byte map to 255 when
+// positive and to 0 when negative.
+static uint8_t clipUint8(int a)
+{
+    if (a&(~0xFF))
+        return (-a) >> 31;
+    else
+        return a;
+}
+
+// Vertical pass producing 8-bit samples: sums filterSize source lines per pixel,
+// starting from a bias of 64 << 12, then shifts right by 19 and clamps.
+static void yuv2PlaneX_c(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    for (int i = 0; i < dstW; i++)
+    {
+        int val = 64 << 12;
+        for (int j = 0; j < filterSize; j++)
+            val += src[j][i] * filter[j];
+        dest[i] = clipUint8(val >> 19);
+    }
+}
+#else
+// Vertical pass producing 16-bit words clipped to the 10-bit range; each word is
+// stored low byte first.
+static void yuv2PlaneX_c_h(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    for (int i = 0; i < dstW; i++)
+    {
+        int val = 1 << 16;
+        uint16_t* dst16bit = (uint16_t *)dest;
+        for (int j = 0; j < filterSize; j++)
+            val += src[j][i] * filter[j];
+        uint16_t d = x265_clip3(0, SHORT_MAX_10, val >> 17);
+        ((uint8_t*)(&dst16bit[i]))[0] = (d);
+        ((uint8_t*)(&dst16bit[i]))[1] = (d) >> 8;
+    }
+}
+// Horizontal pass over 16-bit samples (src is reinterpreted as uint16_t); the
+// tap sum is shifted right by 9 and clipped to the int16_t range.
+static void doScaling_c_h(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+    const uint16_t *srcLocal = (const uint16_t *)src;
+    for (int i = 0; i < dstW; i++)
+    {
+        int val = 0;
+        int sourcePos = filterPos[i];
+        for (int j = 0; j < filterSize; j++)
+            val += ((int)srcLocal[sourcePos + j]) * filter[filterSize * i + j];
+        // the cubic equation does overflow
+        dst[i] = x265_clip3(SHORT_MIN, SHORT_MAX, val >> 9);
+    }
+}
+#endif
+
+ScalerFilter::ScalerFilter() :
+    m_filtLen(0),
+    m_filtPos(NULL),
+    m_filt(NULL),
+    m_sourceSlice(NULL),
+    m_destSlice(NULL)
+{
+}
+
+// Release the coefficient tables; the slices are not owned by the filter.
+ScalerFilter::~ScalerFilter()
+{
+    if (m_filtPos) {
+        delete[] m_filtPos; m_filtPos = NULL;
+    }
+    if (m_filt) {
+        delete[] m_filt; m_filt = NULL;
+    }
+}
+
+// Horizontally scale sliceHor luma lines starting at line sliceVer, from the
+// source slice into the destination slice, growing the destination line count.
+void ScalerHLumFilter::process(int sliceVer, int sliceHor)
+{
+    uint8_t ** src = m_sourceSlice->m_plane[0].lineBuf;
+    uint8_t ** dst = m_destSlice->m_plane[0].lineBuf;
+    int sourcePos = sliceVer - m_sourceSlice->m_plane[0].sliceVer;
+    int destPos = sliceVer - m_destSlice->m_plane[0].sliceVer;
+    int dstW = m_destSlice->m_width;
+    for (int i = 0; i < sliceHor; ++i)
+    {
+        m_hFilterScaler->doScaling((int16_t*)dst[destPos + i], dstW, (const uint8_t *)src[sourcePos + i], m_filt, m_filtPos, m_filtLen);
+        m_destSlice->m_plane[0].sliceHor += 1;
+    }
+}
+
+// Same as the luma stage, applied to both chroma planes (1 and 2) with the
+// destination width reduced by the horizontal chroma subsampling shift.
+void ScalerHCrFilter::process(int sliceVer, int sliceHor)
+{
+    uint8_t ** src1 = m_sourceSlice->m_plane[1].lineBuf;
+    uint8_t ** dst1 = m_destSlice->m_plane[1].lineBuf;
+    uint8_t ** src2 = m_sourceSlice->m_plane[2].lineBuf;
+    uint8_t ** dst2 = m_destSlice->m_plane[2].lineBuf;
+
+    int sourcePos1 = sliceVer - m_sourceSlice->m_plane[1].sliceVer;
+    int destPos1 = sliceVer - m_destSlice->m_plane[1].sliceVer;
+    int sourcePos2 = sliceVer - m_sourceSlice->m_plane[2].sliceVer;
+    int destPos2 = sliceVer - m_destSlice->m_plane[2].sliceVer;
+
+    int dstW = m_destSlice->m_width >> m_destSlice->m_hCrSubSample;
+
+    for (int i = 0; i < sliceHor; ++i)
+    {
+        m_hFilterScaler->doScaling((int16_t*)dst1[destPos1 + i], dstW, src1[sourcePos1 + i], m_filt, m_filtPos, m_filtLen);
+        m_hFilterScaler->doScaling((int16_t*)dst2[destPos2 + i], dstW, src2[sourcePos2 + i], m_filt, m_filtPos, m_filtLen);
+        m_destSlice->m_plane[1].sliceHor += 1;
+        m_destSlice->m_plane[2].sliceHor += 1;
+    }
+}
+
+// Vertical filtering entry point: forwards to the C kernel selected at build
+// time by X265_DEPTH. (The former IdxW / IdxF selection only wrote locals that
+// were never read, so it has been dropped.)
+void VFilterScaler8Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+#if X265_DEPTH == 8
+    yuv2PlaneX_c(filter, filterSize, src, dest, dstW);
+#else
+    yuv2PlaneX_c_h(filter, filterSize, src, dest, dstW);
+#endif
+}
+
+void VFilterScaler10Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    int IdxW = FACTOR_4;
+    int IdxF = FIL_DEF;
+
+    (dstW % 4 == 0)
&& (filterSize == 6) && (IdxF = FIL_6) && (IdxW = FACTOR_4);
+    (dstW % 4 == 0) && (filterSize == 8) && (IdxF = FIL_8) && (IdxW = FACTOR_4);
+
+#if X265_DEPTH == 8
+    yuv2PlaneX_c(filter, filterSize, src, dest, dstW);
+#else
+    yuv2PlaneX_c_h(filter, filterSize, src, dest, dstW);
+#endif
+}
+
+// Produce destination luma line sliceVer from the m_filtLen source lines that
+// start at m_filtPos[sliceVer]; sliceHor is unused.
+void ScalerVLumFilter::process(int sliceVer, int sliceHor)
+{
+    (void)sliceHor;
+    int first = X265_MAX(1 - m_filtLen, m_filtPos[sliceVer]);
+    int sp = first - m_sourceSlice->m_plane[0].sliceVer;
+    int dp = sliceVer - m_destSlice->m_plane[0].sliceVer;
+    uint8_t **src = m_sourceSlice->m_plane[0].lineBuf + sp;
+    uint8_t **dst = m_destSlice->m_plane[0].lineBuf + dp;
+    int16_t *filter = m_filt + (sliceVer * m_filtLen);
+    int dstW = m_destSlice->m_width;
+    m_vFilterScaler->yuv2PlaneX(filter, m_filtLen, (const int16_t**)src, dst[0], dstW);
+}
+
+// Produce the chroma lines (planes 1 and 2) that belong to luma line sliceVer.
+// Luma lines that do not start a chroma line are skipped; sliceHor is unused.
+void ScalerVCrFilter::process(int sliceVer, int sliceHor)
+{
+    (void)sliceHor;
+
+    const int crSkipMask = (1 << m_destSlice->m_vCrSubSample) - 1;
+    if (sliceVer & crSkipMask)
+        return;
+    else
+    {
+        int dstW = m_destSlice->m_width >> m_destSlice->m_hCrSubSample;
+        int crSliceVer = sliceVer >> m_destSlice->m_vCrSubSample;
+        int first = X265_MAX(1 - m_filtLen, m_filtPos[crSliceVer]);
+        int sp1 = first - m_sourceSlice->m_plane[1].sliceVer;
+        int sp2 = first - m_sourceSlice->m_plane[2].sliceVer;
+        int dp1 = crSliceVer - m_destSlice->m_plane[1].sliceVer;
+        int dp2 = crSliceVer - m_destSlice->m_plane[2].sliceVer;
+        uint8_t **src1 = m_sourceSlice->m_plane[1].lineBuf + sp1;
+        uint8_t **src2 = m_sourceSlice->m_plane[2].lineBuf + sp2;
+        uint8_t **dst1 = m_destSlice->m_plane[1].lineBuf + dp1;
+        uint8_t **dst2 = m_destSlice->m_plane[2].lineBuf + dp2;
+        int16_t *filter = m_filt + (crSliceVer * m_filtLen);
+
+        m_vFilterScaler->yuv2PlaneX((int16_t*)filter, m_filtLen, (const int16_t**)src1, dst1[0], dstW);
+        m_vFilterScaler->yuv2PlaneX((int16_t*)filter, m_filtLen, (const int16_t**)src2, dst2[0], dstW);
+    }
+}
+
+/*
+ * Build the position table (m_filtPos) and coefficient table (m_filt) that map
+ * srcW source samples onto dstW destination samples.
+ *
+ * flag      : 1 selects bilinear, 4 selects bicubic; it is also used as the size factor.
+ * inc       : source step per destination sample in 16.16 fixed point.
+ * filtAlign : the final filter length is rounded up to a multiple of this (power of two).
+ * one       : value the coefficients of one destination sample are normalised to.
+ * sourcePos / destPos : sampling position offsets of source and destination.
+ *
+ * Always returns 0. Both tables are allocated with new[] and hold 3 extra
+ * entries / rows that replicate the last destination sample.
+ */
+int ScalerFilter::initCoeff(int flag, int inc, int srcW, int dstW, int filtAlign, int one, int sourcePos, int destPos)
+{
+    int filterSize;
+    int filter2Size;
+    int minFilterSize;
+    int64_t *filter = NULL;
+    int64_t *filter2 = NULL;
+    // Integer floor(log2(srcW / dstW)). When upscaling the ratio is 0 and this
+    // yields 0, where a floating-point log2 has no finite result to convert.
+    int ratioLog2 = 0;
+    for (int ratio = srcW / dstW; ratio > 1; ratio >>= 1)
+        ratioLog2++;
+    const int64_t fone = 1LL << (54 - x265_min(ratioLog2, 8));
+    int *outFilterSize = &m_filtLen;
+    int64_t xDstInSrc;
+    int sizeFactor = flag;
+
+    // Init filter pos, the +3 is for the MMX(+1) / SSE(+3) scaler which reads over the end
+    m_filtPos = new int32_t[dstW + 3];
+    int32_t **filterPos = &m_filtPos;
+
+    if (inc <= 1 << 16)
+        filterSize = 1 + sizeFactor; // upscale
+    else
+        filterSize = 1 + (sizeFactor * srcW + dstW - 1) / dstW;
+
+    filterSize = x265_min(filterSize, srcW - 2);
+    filterSize = x265_max(filterSize, 1);
+    // one int64_t per (destination sample, tap); new[] counts elements, not bytes
+    filter = new int64_t[(size_t)dstW * filterSize];
+
+    xDstInSrc = ((destPos*(int64_t)inc) >> 7) - ((sourcePos * 0x10000LL) >> 7);
+    for (int i = 0; i < dstW; i++)
+    {
+        int xx = (xDstInSrc - (filterSize - 2) * (1LL << 16)) / (1 << 17);
+        (*filterPos)[i] = xx;
+        for (int j = 0; j < filterSize; j++)
+        {
+            int64_t d = (X265_ABS(((int64_t)xx * (1 << 17)) - xDstInSrc)) << 13;
+            int64_t coeff = 0;
+
+            if (inc > 1 << 16)
+                d = d * dstW / srcW;
+
+            if (flag == 4) // BiCUBIC
+            {
+                int64_t B = (0) * (1 << 24);
+                int64_t C = (0.6) * (1 << 24);
+
+                if (d >= 1LL << 31)
+                    coeff = 0;
+                else
+                {
+                    int64_t dd = (d * d) >> 30;
+                    int64_t ddd = (dd * d) >> 30;
+
+                    if (d < 1LL << 30)
+                        coeff = (12 * (1 << 24) - 9 * B - 6 * C) * ddd + (-18 * (1 << 24) + 12 * B + 6 * C) * dd + (6 * (1 << 24) - 2 * B) * (1 << 30);
+                    else
+                        coeff = (-B - 6 * C) * ddd + (6 * B + 30 * C) * dd + (-12 * B - 48 * C) * d + (8 * B + 24 * C) * (1 << 30);
+                }
+                coeff /= (1LL << 54) / fone;
+            }
+            else if (flag == 1) // BILINEAR
+            {
+                coeff = (1 << 30) - d;
+                if (coeff < 0)
+                    coeff = 0;
+                coeff *= fone >> 30;
+            }
+            else
+                assert(0);
+
+            filter[i * filterSize + j] = coeff;
+            xx++;
+        }
+        xDstInSrc += 2 * inc;
+    }
+
+    //apply src & dst Filter to filter -> filter2
+    X265_CHECK(filterSize > 0, "invalid filterSize value.\n");
+    filter2Size = filterSize;
+    filter2 = new int64_t[(size_t)dstW * filter2Size];
+
+    int size = dstW * filterSize;
+    filter_copy_c(filter, filter2, size);
+
+    delete[](filter);
+
+    // try to reduce the filter-size (step1 find size and shift left)
+    // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
+    minFilterSize = 0;
+    for (int i = dstW - 1; i >= 0; i--)
+    {
+        int min = filter2Size;
+        int64_t cutOff = 0;
+
+        // get rid of near zero elements on the left by shifting left
+        for (int j = 0; j < filter2Size; j++)
+        {
+            int k;
+            // always the first tap: earlier iterations shifted the row left
+            cutOff += X265_ABS(filter2[i * filter2Size]);
+
+            if (cutOff > SCALER_MAX_REDUCE_CUTOFF * fone)
+                break;
+            // preserve monotonicity because the core can't handle the filter otherwise
+            if (i < dstW - 1 && (*filterPos)[i] >= (*filterPos)[i + 1])
+                break;
+
+            // move filter coefficients left
+            for (k = 1; k < filter2Size; k++)
+                filter2[i * filter2Size + k - 1] = filter2[i * filter2Size + k];
+            filter2[i * filter2Size + k - 1] = 0;
+            (*filterPos)[i]++;
+        }
+
+        cutOff = 0;
+        // count near zeros on the right
+        for (int j = filter2Size - 1; j > 0; j--)
+        {
+            cutOff += X265_ABS(filter2[i * filter2Size + j]);
+
+            if (cutOff > SCALER_MAX_REDUCE_CUTOFF * fone)
+                break;
+            min--;
+        }
+
+        if (min > minFilterSize)
+            minFilterSize = min;
+    }
+
+    X265_CHECK(minFilterSize > 0, "invalid minFilterSize value.\n");
+    filterSize = (minFilterSize + (filtAlign - 1)) & (~(filtAlign - 1));
+    X265_CHECK(filterSize > 0, "invalid filterSize value.\n");
+    filter = new int64_t[(size_t)dstW * filterSize];
+
+    *outFilterSize = filterSize;
+
+    // try to reduce the filter-size (step2 reduce it)
+    for (int i = 0; i < dstW; i++)
+    {
+        for (int j = 0; j < filterSize; j++)
+        {
+            if (j >= filter2Size)
+                filter[i * filterSize + j] = 0;
+            else
+                filter[i * filterSize + j] = filter2[i * filter2Size + j];
+            if ((flag & SCALER_BITEXACT) && j >= minFilterSize)
+                filter[i * filterSize + j] = 0;
+        }
+    }
+
+    // fix borders
+    for (int i = 0; i < dstW; i++)
+    {
+        int j;
+        if ((*filterPos)[i] < 0)
+        {
+            // move filter coefficients left to compensate for filterPos
+            for (j = 1; j < filterSize; j++)
+            {
+                int left = x265_max(j + (*filterPos)[i], 0);
+                filter[i * filterSize + left] += filter[i * filterSize + j];
+                filter[i * filterSize + j] = 0;
+            }
+            (*filterPos)[i] = 0;
+        }
+
+        if ((*filterPos)[i] + filterSize > srcW)
+        {
+            int shift = (*filterPos)[i] + x265_min(filterSize - srcW, 0);
+            int64_t acc = 0;
+
+            // fold the taps that fall beyond the last source sample into acc
+            for (j = filterSize - 1; j >= 0; j--)
+            {
+                if ((*filterPos)[i] + j >= srcW)
+                {
+                    acc += filter[i * filterSize + j];
+                    filter[i * filterSize + j] = 0;
+                }
+            }
+            for (j = filterSize - 1; j >= 0; j--)
+            {
+                if (j < shift)
+                    filter[i * filterSize + j] = 0;
+                else
+                    filter[i * filterSize + j] = filter[i * filterSize + j - shift];
+            }
+
+            (*filterPos)[i] -= shift;
+            filter[i * filterSize + srcW - 1 - (*filterPos)[i]] += acc;
+        }
+
+        X265_CHECK((*filterPos)[i] >= 0, "invalid: Value of (*filterPos)[%d] < 0.\n", i);
+        X265_CHECK((*filterPos)[i] < srcW, "invalid: Value of (*filterPos)[%d] > %d .\n", i, srcW);
+        if ((*filterPos)[i] + filterSize > srcW)
+        {
+            for (j = 0; j < filterSize; j++)
+            {
+                // a tap may only reach past the source width if its coefficient is zero
+                X265_CHECK((*filterPos)[i] + j < srcW || !filter[i * filterSize + j],
+                           "invalid: filter[%d * filterSize + %d] is non-zero beyond source width %d.\n", i, j, srcW);
+            }
+        }
+    }
+
+    // init filter
+    m_filt = new int16_t[(dstW + 3)*(*outFilterSize)];
+    int16_t **outFilter = &m_filt;
+
+    // normalize & store in outFilter
+    for (int i = 0; i < dstW; i++)
+    {
+        int64_t error = 0;
+        int64_t sum = 0;
+
+        for (int j = 0; j <
filterSize; j++)
+            sum += filter[i * filterSize + j];
+        sum = (sum + one / 2) / one;
+        if (!sum)
+        {
+            x265_log(NULL, X265_LOG_WARNING, "Scaler: zero vector in scaling\n");
+            sum = 1;
+        }
+        for (int j = 0; j < *outFilterSize; j++)
+        {
+            int64_t v = filter[i * filterSize + j] + error;
+            int intV = ROUNDED_DIVISION(v, sum);
+            (*outFilter)[i * (*outFilterSize) + j] = intV;
+            error = v - intV * sum;
+        }
+    }
+
+    (*filterPos)[dstW + 0] =
+    (*filterPos)[dstW + 1] =
+    (*filterPos)[dstW + 2] = (*filterPos)[dstW - 1];
+    for (int i = 0; i < *outFilterSize; i++)
+    {
+        int k = (dstW - 1) * (*outFilterSize) + i;
+        (*outFilter)[k + 1 * (*outFilterSize)] =
+        (*outFilter)[k + 2 * (*outFilterSize)] =
+        (*outFilter)[k + 3 * (*outFilterSize)] = (*outFilter)[k];
+    }
+
+    delete[](filter);
+    delete[](filter2);
+    return 0;
+}
+
+/*
+ * Configure the scaler for one source / destination picture format.
+ *
+ * algorithmFlags : 1 for bilinear, 4 for bicubic (passed on to initCoeff).
+ * srcVideoDesc / dstVideoDesc : geometry, colour space and bit depth; the bit
+ *                  depth is taken from dstVideoDesc.
+ *
+ * Returns 0 on success and -1 when the colour spaces differ or a luma / chroma
+ * dimension needed as a divisor is not positive.
+ */
+int ScalerFilterManager::init(int algorithmFlags, VideoDesc *srcVideoDesc, VideoDesc *dstVideoDesc)
+{
+    int srcW = m_srcW = srcVideoDesc->m_width;
+    int srcH = m_srcH = srcVideoDesc->m_height;
+    int dstW = m_dstW = dstVideoDesc->m_width;
+    int dstH = m_dstH = dstVideoDesc->m_height;
+    int lumXInc, crXInc;
+    int lumYInc, crYInc;
+    int srcHCrPos;
+    int dstHCrPos;
+    int srcVCrPos;
+    int dstVCrPos;
+    m_bitDepth = dstVideoDesc->m_inputDepth;
+
+    m_algorithmFlags = algorithmFlags;
+
+    srcHCrPos = -513;
+    dstHCrPos = -513;
+    srcVCrPos = -513;
+    dstVCrPos = -513;
+
+    int srcCsp = srcVideoDesc->m_csp;
+    if (x265_cli_csps[srcCsp].planes > 1)
+    {
+        m_crSrcHSubSample = x265_cli_csps[srcCsp].width[1];
+        m_crSrcVSubSample = x265_cli_csps[srcCsp].height[1];
+        m_crSrcW = srcVideoDesc->m_width >> m_crSrcHSubSample;
+        m_crSrcH = srcVideoDesc->m_height >> m_crSrcVSubSample;
+        if (srcCsp == 1)// i420
+            srcVCrPos = 128;
+    }
+    else
+    {
+        m_crSrcW = 0;
+        m_crSrcH = 0;
+        m_crSrcHSubSample = 0;
+        m_crSrcVSubSample = 0;
+    }
+    int dstCsp = dstVideoDesc->m_csp;
+    if (x265_cli_csps[dstCsp].planes > 1)
+    {
+        m_crDstHSubSample = x265_cli_csps[dstCsp].width[1];
+        m_crDstVSubSample = x265_cli_csps[dstCsp].height[1];
+        m_crDstW = dstVideoDesc->m_width >> m_crDstHSubSample;
+        m_crDstH = dstVideoDesc->m_height >> m_crDstVSubSample;
+        if (dstCsp == 1)// i420
+            dstVCrPos = 128;
+    }
+    else
+    {
+        m_crDstW = 0;
+        m_crDstH = 0;
+        m_crDstHSubSample = 0;
+        m_crDstVSubSample = 0;
+    }
+    // Only srcCsp == dstCsp is supported at present
+    if (srcCsp != dstCsp)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "wrong, source csp != destination csp \n");
+        return -1;
+    }
+
+    // Every size below is used as a divisor, here or inside initCoeff(); a csp
+    // without chroma planes leaves the chroma sizes at 0.
+    if (srcW <= 0 || srcH <= 0 || dstW <= 0 || dstH <= 0 ||
+        m_crSrcW <= 0 || m_crSrcH <= 0 || m_crDstW <= 0 || m_crDstH <= 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "Scaler: unsupported picture dimensions or color space\n");
+        return -1;
+    }
+
+    // source step per destination sample, 16.16 fixed point, rounded to nearest
+    lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW;
+    lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH;
+    crXInc = (((int64_t)m_crSrcW << 16) + (m_crDstW >> 1)) / m_crDstW;
+    crYInc = (((int64_t)m_crSrcH << 16) + (m_crDstH >> 1)) / m_crDstH;
+
+    const int filterAlign = 1;
+
+    // init horizontal Luma Scaler filter
+    m_ScalerFilters[0] = new ScalerHLumFilter(m_bitDepth);
+    m_ScalerFilters[0]->initCoeff(m_algorithmFlags, lumXInc, srcW, dstW, filterAlign, 1 << 14, getLocalPos(0, 0), getLocalPos(0, 0));
+
+    // init horizontal cr Scaler filter
+    m_ScalerFilters[1] = new ScalerHCrFilter(m_bitDepth);
+    m_ScalerFilters[1]->initCoeff(m_algorithmFlags, crXInc, m_crSrcW, m_crDstW, filterAlign, 1 << 14,
+        getLocalPos(m_crSrcHSubSample, srcHCrPos), getLocalPos(m_crDstHSubSample, dstHCrPos));
+
+    // init vertical Luma scaler filter
+    m_ScalerFilters[2] = new ScalerVLumFilter(m_bitDepth);
+    m_ScalerFilters[2]->initCoeff(m_algorithmFlags, lumYInc, srcH, dstH, filterAlign, 1 << 12, getLocalPos(0, 0), getLocalPos(0, 0));
+
+    // init vertical cr scaler filter
+    m_ScalerFilters[3] = new ScalerVCrFilter(m_bitDepth);
+    m_ScalerFilters[3]->initCoeff(m_algorithmFlags, crYInc, m_crSrcH, m_crDstH, filterAlign, 1 << 12,
+        getLocalPos(m_crSrcVSubSample, srcVCrPos), getLocalPos(m_crDstVSubSample, dstVCrPos));
+
+    // init slice, must after filter initialization
+    // NOTE(review): the return value is not checked; confirm initScalerSlice()'s error convention.
+    initScalerSlice();
+
+    // set slice: horizontal stages read slice 0 and write slice 1,
+    // vertical stages read slice 1 and write slice 2
+    m_ScalerFilters[0]->setSlice(m_slices[0], m_slices[1]);
+    m_ScalerFilters[1]->setSlice(m_slices[0], m_slices[1]);
+
+    m_ScalerFilters[2]->setSlice(m_slices[1], m_slices[2]);
+    m_ScalerFilters[3]->setSlice(m_slices[1], m_slices[2]);
+
+    return 0;
+}
+
+// Horizontal filtering entry point: forwards to the C kernel selected at build
+// time by X265_DEPTH. (The former IdxW / IdxF selection only wrote locals that
+// were never read, so it has been dropped.)
+void HFilterScaler8Bit::doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+#if X265_DEPTH == 8
+    doScaling_c(dst, dstW, src, filter, filterPos, filterSize);
+#else
+    doScaling_c_h(dst, dstW, src, filter, filterPos, filterSize);
+#endif
+}
+
+// Same dispatch as HFilterScaler8Bit::doScaling.
+void HFilterScaler10Bit::doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+#if X265_DEPTH == 8
+    doScaling_c(dst, dstW, src, filter, filterPos, filterSize);
+#else
+    doScaling_c_h(dst, dstW, src, filter, filterPos, filterSize);
+#endif
+}
+
+int ScalerFilterManager::scale_pic(void ** src, void ** dst, int * srcStride, int *
dstStride)
+{
+    // Scale one picture from the src planes into the dst planes.
+    //   src / dst             arrays of plane pointers (cast to uint8_t** below)
+    //   srcStride / dstStride per-plane strides handed to ScalerSlice::initFromSrc
+    // Returns -1 when src or dst is NULL, otherwise 0.
+    // NOTE(review): initFromSrc reads entries [0..3] of both the plane and the
+    // stride arrays - callers presumably pass four entries; confirm.
+    uint8_t** src_8bit, **dst_8bit;
+    src_8bit = (uint8_t**)src;
+    dst_8bit = (uint8_t**)dst;
+    // src_8bit / dst_8bit are only used for this NULL check
+    if (!src_8bit || !dst_8bit)
+        return -1;
+
+    const int srcsliceHor = m_srcH;
+    const int dstW = m_dstW;
+    const int dstH = m_dstH;
+    // filters 2 and 3 are the vertical luma / chroma filters
+    int32_t *vLumFilterPos = m_ScalerFilters[2]->m_filtPos;
+    int32_t *vCrFilterPos = m_ScalerFilters[3]->m_filtPos;
+    const int vLumFilterSize = m_ScalerFilters[2]->m_filtLen;
+    const int vCrFilterSize = m_ScalerFilters[3]->m_filtLen;
+    // number of chroma source lines: luma height shifted by the vertical subsampling, rounded up
+    const int crSrcsliceHor = UH_CEIL_SHIFTR(srcsliceHor, m_crSrcVSubSample);
+
+    // vars which will change and which we need to store back in the context
+    // (lumBufIndex / crBufIndex are updated below but never read in this function)
+    int lumBufIndex = -1;
+    int crBufIndex = -1;
+    int lastInLumBuf = -1;
+    int lastInCrBuf = -1;
+
+    int hasLumHoles = 1;
+    int hasCrHoles = 1;
+
+    // slice 0 wraps the caller's source planes, slice 1 is the horizontal
+    // filter output, slice 2 wraps the caller's destination planes
+    ScalerSlice *src_slice = m_slices[0];
+    ScalerSlice *hout_slice = m_slices[1];
+    ScalerSlice *vout_slice = m_slices[2];
+    src_slice->initFromSrc((uint8_t**)src, srcStride, m_srcW, 0, srcsliceHor, 0, crSrcsliceHor, 1);
+    vout_slice->initFromSrc((uint8_t**)dst, dstStride, m_dstW, 0, dstH, 0, UH_CEIL_SHIFTR(dstH, m_crDstVSubSample), 0);
+
+    // start every picture with an empty intermediate slice
+    hout_slice->m_plane[0].sliceVer = 0;
+    hout_slice->m_plane[1].sliceVer = 0;
+    hout_slice->m_plane[2].sliceVer = 0;
+    hout_slice->m_plane[3].sliceVer = 0;
+    hout_slice->m_plane[0].sliceHor = 0;
+    hout_slice->m_plane[1].sliceHor = 0;
+    hout_slice->m_plane[2].sliceHor = 0;
+    hout_slice->m_plane[3].sliceHor = 0;
+    hout_slice->m_width = dstW;
+
+    // one iteration per destination luma line
+    for (int dstY = 0; dstY < dstH; dstY++)
+    {
+        const int crDstY = dstY >> m_crDstVSubSample;
+        // first source line needed by the vertical filters for this output line
+        const int firstLumSrcY = x265_max(1 - vLumFilterSize, vLumFilterPos[dstY]);
+        // same, for the last luma line that shares this chroma line (clamped to dstH - 1)
+        const int firstLumSrcY2 = x265_max(1 - vLumFilterSize, vLumFilterPos[x265_min(dstY | ((1 << m_crDstVSubSample) - 1), dstH - 1)]);
+        const int firstCrSrcY = x265_max(1 - vCrFilterSize, vCrFilterPos[crDstY]);
+
+        // last source line needed, clamped to the source height
+        int lastLumSrcY = x265_min(m_srcH, firstLumSrcY + vLumFilterSize) - 1;
+        int lastLumSrcY2 = x265_min(m_srcH, firstLumSrcY2 + vLumFilterSize) - 1;
+        int lastCrSrcY = x265_min(m_crSrcH, firstCrSrcY + vCrFilterSize) - 1;
+
+        // handle holes
+        // (a hole: the first line needed is not the one right after the last buffered line)
+        if (firstLumSrcY > lastInLumBuf)
+        {
+            hasLumHoles = lastInLumBuf != firstLumSrcY - 1;
+            if (hasLumHoles)
+            {
+                // restart the luma / plane-3 window at the first needed line
+                hout_slice->m_plane[0].sliceVer = firstLumSrcY;
+                hout_slice->m_plane[3].sliceVer = firstLumSrcY;
+                hout_slice->m_plane[0].sliceHor =
+                hout_slice->m_plane[3].sliceHor = 0;
+            }
+
+            lastInLumBuf = firstLumSrcY - 1;
+        }
+        if (firstCrSrcY > lastInCrBuf)
+        {
+            hasCrHoles = lastInCrBuf != firstCrSrcY - 1;
+            if (hasCrHoles)
+            {
+                // restart both chroma windows at the first needed line
+                hout_slice->m_plane[1].sliceVer = firstCrSrcY;
+                hout_slice->m_plane[2].sliceVer = firstCrSrcY;
+                hout_slice->m_plane[1].sliceHor =
+                hout_slice->m_plane[2].sliceHor = 0;
+            }
+
+            lastInCrBuf = firstCrSrcY - 1;
+        }
+
+        // Do we have enough lines in this slice to output the dstY line
+        int enoughLines = lastLumSrcY2 < 0 + srcsliceHor && lastCrSrcY < UH_CEIL_SHIFTR(0 + srcsliceHor, m_crSrcVSubSample);
+        if (!enoughLines)
+        {
+            // buffer whatever is left of the source and stop after this iteration
+            lastLumSrcY = 0 + srcsliceHor - 1;
+            lastCrSrcY = 0 + crSrcsliceHor - 1;
+            x265_log(NULL, X265_LOG_INFO, "buffering slice: lastLumSrcY %d lastCrSrcY %d\n", lastLumSrcY, lastCrSrcY);
+        }
+
+        // the lines required must fit into the intermediate slice
+        X265_CHECK(((lastLumSrcY - firstLumSrcY + 1) <= hout_slice->m_plane[0].availLines), "invalid value %d", lastLumSrcY - firstLumSrcY + 1);
+        X265_CHECK((lastCrSrcY - firstCrSrcY + 1) <= hout_slice->m_plane[1].availLines, "invalid value %d", lastCrSrcY - firstCrSrcY + 1);
+
+        // range of source lines to run through the horizontal filters now
+        int firstPosY, lastPosY, firstCPosY, lastCPosY;
+        // posY: first luma line not yet held by the intermediate slice
+        int posY = hout_slice->m_plane[0].sliceVer + hout_slice->m_plane[0].sliceHor;
+        if (posY <= lastLumSrcY && !hasLumHoles)
+        {
+            // no hole: fill as far ahead as the buffer and the source allow
+            firstPosY = x265_max(firstLumSrcY, posY);
+            lastPosY = x265_min(firstLumSrcY + hout_slice->m_plane[0].availLines - 1, 0 + srcsliceHor - 1);
+        }
+        else
+        {
+            firstPosY = posY;
+            lastPosY = lastLumSrcY;
+        }
+
+        // same computation for chroma
+        int cPosY = hout_slice->m_plane[1].sliceVer + hout_slice->m_plane[1].sliceHor;
+        if (cPosY <= lastCrSrcY && !hasCrHoles)
+        {
+            firstCPosY = x265_max(firstCrSrcY, cPosY);
+            lastCPosY = x265_min(firstCrSrcY + hout_slice->m_plane[1].availLines - 1, UH_CEIL_SHIFTR(0 + srcsliceHor, m_crSrcVSubSample) - 1);
+        }
+        else
+        {
+            firstCPosY = cPosY;
+            lastCPosY = lastCrSrcY;
+        }
+
+        // advance the intermediate slice windows before writing new lines
+        hout_slice->rotate(lastPosY, lastCPosY);
+        // horizontal luma scale
+        if (posY < lastLumSrcY + 1)
+            m_ScalerFilters[0]->process(firstPosY, lastPosY - firstPosY + 1);
+
+        lumBufIndex += lastLumSrcY - lastInLumBuf;
+        lastInLumBuf = lastLumSrcY;
+        // horizontal chroma Scale
+        if (cPosY < lastCrSrcY + 1)
+            m_ScalerFilters[1]->process(firstCPosY, lastCPosY - firstCPosY + 1);
+
+        crBufIndex += lastCrSrcY - lastInCrBuf;
+        lastInCrBuf = lastCrSrcY;
+
+        // wrap buf index around to stay inside the ring buffer
+        if (lumBufIndex >= vLumFilterSize)
+            lumBufIndex -= vLumFilterSize;
+        if (crBufIndex >= vCrFilterSize)
+            crBufIndex -= vCrFilterSize;
+        if (!enoughLines)
+            break;  // we can't output a dstY line so let's try with the next slice
+
+        // vertical scale(output converter)
+        // filters 2..m_numFilter-1 each produce one output line at dstY
+        for (int i = 2; i < m_numFilter; ++i)
+            m_ScalerFilters[i]->process(dstY, 1);
+    }
+    return 0;
+}
+
+// Compute how many lines the horizontal-output slice must be able to hold.
+//   out_lum_size  receives the luma line count
+//   out_cr_size   receives the chroma line count
+// Both start at the vertical filter length and grow to the largest distance
+// between a filter's first source line and the last line that has to be
+// buffered for the same output line.
+void ScalerFilterManager::getMinBufferSize(int *out_lum_size, int *out_cr_size)
+{
+    int lumY;
+    int dstH = m_dstH;
+    int crDstH = m_crDstH;
+    int *lumFilterPos = m_ScalerFilters[2]->m_filtPos;
+    int *crFilterPos = m_ScalerFilters[3]->m_filtPos;
+    int lumFilterSize = m_ScalerFilters[2]->m_filtLen;
+    int crFilterSize = m_ScalerFilters[3]->m_filtLen;
+    int crSubSample = m_crSrcVSubSample;
+
+    *out_lum_size = lumFilterSize;
+    *out_cr_size = crFilterSize;
+
+    for (lumY = 0; lumY < dstH; lumY++)
+    {
+        // chroma output line that corresponds to this luma output line
+        int crY = (int64_t)lumY * crDstH / dstH;
+        // last source line needed by either filter, expressed in luma lines
+        int nextSlice = x265_max(lumFilterPos[lumY] + lumFilterSize - 1, ((crFilterPos[crY] + crFilterSize - 1) << crSubSample));
+
+        // round down to a multiple of (1 << crSubSample)
+        nextSlice >>= crSubSample;
+        nextSlice <<= crSubSample;
+        (*out_lum_size) = x265_max((*out_lum_size), nextSlice - lumFilterPos[lumY]);
+        (*out_cr_size) = x265_max((*out_cr_size), (nextSlice >> crSubSample) - crFilterPos[crY]);
+    }
+}
+
+int
ScalerFilterManager::initScalerSlice()
+{
+    // Allocate the three slices used by scale_pic():
+    //   m_slices[0] source lines, m_slices[1] horizontal filter output (ring),
+    //   m_slices[2] vertical filter output.
+    // Reads the vertical filters (m_ScalerFilters[2] / [3]), so the filters
+    // must already be set up. Returns 0 on success, -1 when an allocation fails.
+    int ret = 0;
+    // bytes per intermediate line: int16_t samples plus 66 spare bytes, rounded up to 16
+    int dst_stride = SCALER_ALIGN(m_dstW * sizeof(int16_t) + 66, 16);
+    if (m_bitDepth == 16)
+        dst_stride <<= 1;
+
+    int lumBufSize;
+    int crBufSize;
+    int vLumFilterSize = m_ScalerFilters[2]->m_filtLen; // Vertical filter size for luma pixels.
+    int vCrFilterSize = m_ScalerFilters[3]->m_filtLen; // Vertical filter size for chroma pixels.
+    getMinBufferSize(&lumBufSize, &crBufSize);
+    // keep at least MAX_NUM_LINES_AHEAD lines beyond the filter length
+    lumBufSize = X265_MAX(lumBufSize, vLumFilterSize + MAX_NUM_LINES_AHEAD);
+    crBufSize = X265_MAX(crBufSize, vCrFilterSize + MAX_NUM_LINES_AHEAD);
+
+    // NOTE(review): existing m_slices[] entries are overwritten without being
+    // released - presumably this runs once per manager; confirm.
+    for (int i = 0; i < m_numSlice; i++)
+        m_slices[i] = new ScalerSlice;
+    // source slice: pointer table only, lines are attached by initFromSrc()
+    ret = m_slices[0]->create(m_srcH, m_crSrcH, m_crSrcHSubSample, m_crSrcVSubSample, 0);
+    if (ret < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "alloc_slice m_slice[0] failed\n");
+        return -1;
+    }
+
+    // horizontal scaler output
+    // (created as a ring, and the only slice that owns its line memory)
+    ret = m_slices[1]->create(lumBufSize, crBufSize, m_crDstHSubSample, m_crDstVSubSample, 1);
+    if (ret < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "m_slice[1].create failed\n");
+        return -1;
+    }
+    ret = m_slices[1]->createLines(dst_stride, m_dstW);
+    if (ret < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "m_slice[1].createLines failed\n");
+        return -1;
+    }
+
+    // pre-fill the freshly allocated lines with a constant
+    m_slices[1]->fillOnes(dst_stride >> 1, m_bitDepth == 16);
+
+    // vertical scaler output
+    ret = m_slices[2]->create(m_dstH, m_crDstH, m_crDstHSubSample, m_crDstVSubSample, 0);
+    if (ret < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "m_slice[2].create failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+// Map a sample position to the given subsampled plane.
+//   crSubSample  right-shift applied to the result
+//   pos          input position; -1 and anything <= -513 are replaced by
+//                (128 << crSubSample) - 128 before conversion
+//                (presumably "position not specified" markers - confirm with callers)
+// Returns (pos + 128) >> crSubSample.
+int ScalerFilterManager::getLocalPos(int crSubSample, int pos)
+{
+    if (pos == -1 || pos <= -513)
+        pos = (128 << crSubSample) - 128;
+    pos += 128; // relative to ideal left edge
+    return pos >> crSubSample;
+}
+
+// Construct an empty slice: no lines, no pointer tables.
+ScalerSlice::ScalerSlice() :
+    m_width(0),
+    m_hCrSubSample(0),
+    m_vCrSubSample(0),
+    m_isRing(0),
+    m_destroyLines(0)
+{
+    for (int i = 0; i < m_numSlicePlane; i++)
+    {
+        m_plane[i].availLines = 0;
+        m_plane[i].sliceVer = 0;
+
m_plane[i].sliceHor = 0;
+        m_plane[i].lineBuf = NULL;
+    }
+}
+
+// Release everything the slice owns: the line buffers (only when they were
+// allocated by createLines()) and the per-plane pointer tables.
+// Safe to call more than once: each table pointer is cleared after it is
+// freed, so a later call - e.g. the one made by ~ScalerSlice() after an
+// explicit destroy() - finds nothing left to free.
+void ScalerSlice::destroy()
+{
+    if (m_destroyLines)
+        destroyLines();
+    for (int i = 0; i < m_numSlicePlane; i++)
+    {
+        if (m_plane[i].lineBuf)
+        {
+            X265_FREE(m_plane[i].lineBuf);
+            m_plane[i].lineBuf = NULL; // prevents a double free on repeated destroy()
+        }
+    }
+}
+
+// Allocate the per-plane line pointer tables.
+//   lumLines      line count for planes 0 and 3
+//   crLines       line count for planes 1 and 2
+//   h_sub_sample  stored in m_hCrSubSample
+//   v_sub_sample  stored in m_vCrSubSample
+//   ring          non-zero marks the slice as a ring buffer; its tables get
+//                 three times as many entries
+// Returns 0 on success, -1 when a table cannot be allocated.
+int ScalerSlice::create(int lumLines, int crLines, int h_sub_sample, int v_sub_sample, int ring)
+{
+    int i;
+    int size[4] = { lumLines, crLines, crLines, lumLines };
+
+    m_hCrSubSample = h_sub_sample;
+    m_vCrSubSample = v_sub_sample;
+    m_isRing = ring;
+    m_destroyLines = 0;
+
+    for (i = 0; i < m_numSlicePlane; ++i)
+    {
+        int n = size[i] * (ring == 0 ? 1 : 3);
+        m_plane[i].lineBuf = X265_MALLOC(uint8_t*, n);
+        if (!m_plane[i].lineBuf)
+            return -1;
+        // Start with an all-NULL table: destroyLines() frees every entry of
+        // planes 0 and 1, including entries createLines() never reached when
+        // it bails out on an allocation failure.
+        memset(m_plane[i].lineBuf, 0, sizeof(uint8_t*) * n);
+
+        m_plane[i].availLines = size[i];
+        m_plane[i].sliceVer = 0;
+        m_plane[i].sliceHor = 0;
+    }
+    return 0;
+}
+
+/*
+slice lines contains extra bytes for vectorial code thus @size
+is the allocated memory size and @width is the number of pixels
+
+Each allocation made for plane 0 / plane 1 is size * 2 + 32 bytes and is
+shared with plane 3 / plane 2 respectively, whose line starts size + 16
+bytes into it. For a ring slice entry j + n aliases entry j.
+Returns 0 on success; on allocation failure the lines allocated so far are
+released through destroyLines() and -1 is returned.
+*/
+int ScalerSlice::createLines(int size, int width)
+{
+    int i;
+    int idx[2] = { 3, 2 };
+
+    m_destroyLines = 1;
+    m_width = width;
+
+    for (i = 0; i < 2; ++i) {
+        int n = m_plane[i].availLines;
+        int j;
+        int ii = idx[i];
+        assert(n == m_plane[ii].availLines);
+        for (j = 0; j < n; ++j)
+        {
+            // chroma plane line U and V are expected to be contiguous in memory
+            m_plane[i].lineBuf[j] = (uint8_t*)X265_MALLOC(uint8_t, size * 2 + 32);
+            if (!m_plane[i].lineBuf[j])
+            {
+                destroyLines();
+                return -1;
+            }
+            m_plane[ii].lineBuf[j] = m_plane[i].lineBuf[j] + size + 16;
+            if (m_isRing)
+            {
+                m_plane[i].lineBuf[j + n] = m_plane[i].lineBuf[j];
+                m_plane[ii].lineBuf[j + n] = m_plane[ii].lineBuf[j];
+            }
+        }
+    }
+
+    return 0;
+}
+
+void ScalerSlice::destroyLines()
+{
+    int i;
+    for (i = 0; i < 2; ++i)
+    {
+        int n = m_plane[i].availLines;
+        int j;
+        for (j = 0; j < n; ++j)
+        {
+            X265_FREE(m_plane[i].lineBuf[j]);
+            m_plane[i].lineBuf[j] = NULL;
+            if (m_isRing)
+                m_plane[i].lineBuf[j + n] = NULL;
+        }
+    }
+
+    for (i = 0; i < m_numSlicePlane; ++i)
+
memset(m_plane[i].lineBuf, 0, sizeof(uint8_t*) * m_plane[i].availLines * (m_isRing ? 3 : 1));
+    m_destroyLines = 0;
+}
+
+// Pre-fill the lines of all four planes with a constant.
+//   n        element count before the 16-bit adjustment (halved when is16bit is set)
+//   is16bit  non-zero: lines are written as int32_t with value 1 << 18,
+//            zero:     lines are written as int16_t with value 1 << 14
+// One element more than the computed count is written per line.
+void ScalerSlice::fillOnes(int n, int is16bit)
+{
+    int i;
+    for (i = 0; i < m_numSlicePlane; ++i)
+    {
+        int j;
+        int size = m_plane[i].availLines;
+        for (j = 0; j < size; ++j)
+        {
+            int k;
+            int end = is16bit ? n >> 1 : n;
+            // fill also one extra element
+            end += 1;
+            if (is16bit)
+                for (k = 0; k < end; ++k)
+                    ((int32_t*)(m_plane[i].lineBuf[j]))[k] = 1 << 18;
+            else
+                for (k = 0; k < end; ++k)
+                    ((int16_t*)(m_plane[i].lineBuf[j]))[k] = 1 << 14;
+        }
+    }
+}
+
+// Advance the plane windows once the requested line is at least two buffer
+// lengths past the window start.
+//   lum  luma line index, applied to planes 0 and 3 (skipped when 0)
+//   cr   chroma line index, applied to planes 1 and 2 (skipped when 0)
+// A window that advances moves its first line (sliceVer) forward by
+// availLines and reduces its line count (sliceHor) by the same amount.
+// Always returns 0.
+int ScalerSlice::rotate(int lum, int cr)
+{
+    int i;
+    if (lum)
+    {
+        // i takes the values 0 and 3
+        for (i = 0; i < m_numSlicePlane; i += 3)
+        {
+            int n = m_plane[i].availLines;
+            int l = lum - m_plane[i].sliceVer;
+
+            if (l >= n * 2)
+            {
+                m_plane[i].sliceVer += n;
+                m_plane[i].sliceHor -= n;
+            }
+        }
+    }
+    if (cr)
+    {
+        for (i = 1; i < 3; ++i)
+        {
+            int n = m_plane[i].availLines;
+            int l = cr - m_plane[i].sliceVer;
+
+            if (l >= n * 2)
+            {
+                m_plane[i].sliceVer += n;
+                m_plane[i].sliceHor -= n;
+            }
+        }
+    }
+    return 0;
+}
+
+// Point the slice's line tables at caller-owned picture memory.
+//   src, stride  per-plane base pointers and strides; all four entries are read
+//   srcW         stored as the slice width
+//   lumY, lumH   first line and line count for planes 0 and 3
+//   crY, crH     first line and line count for planes 1 and 2
+//   relative     non-zero: src[] already points at the first line,
+//                zero:     the first line is start * stride past src[]
+// Always returns 0.
+int ScalerSlice::initFromSrc(uint8_t *src[4], const int stride[4], int srcW, int lumY, int lumH, int crY, int crH, int relative)
+{
+    int i = 0;
+
+    const int start[m_numSlicePlane] = { lumY, crY, crY, lumY };
+
+    const int end[m_numSlicePlane] = { lumY + lumH, crY + crH, crY + crH, lumY + lumH };
+
+    // address of the first line of each plane
+    uint8_t *const src_[m_numSlicePlane] = { src[0] + (relative ? 0 : start[0]) * stride[0],
+        src[1] + (relative ? 0 : start[1]) * stride[1],
+        src[2] + (relative ? 0 : start[2]) * stride[2],
+        src[3] + (relative ? 0 : start[3]) * stride[3] };
+
+    m_width = srcW;
+
+    for (i = 0; i < m_numSlicePlane; ++i)
+    {
+        int j;
+        int first = m_plane[i].sliceVer;
+        int n = m_plane[i].availLines;
+        int lines = end[i] - start[i];
+        int tot_lines = end[i] - first;
+
+        if (start[i] >= first && n >= tot_lines)
+        {
+            // new lines extend the current window: keep its start, grow its length
+            m_plane[i].sliceHor = x265_max(tot_lines, m_plane[i].sliceHor);
+            for (j = 0; j < lines; j += 1)
+                m_plane[i].lineBuf[start[i] - first + j] = src_[i] + j * stride[i];
+        }
+        else
+        {
+            // otherwise restart the window at start[i], capped at the table size
+            m_plane[i].sliceVer = start[i];
+            lines = lines > n ? n : lines;
+            m_plane[i].sliceHor = lines;
+            for (j = 0; j < lines; j += 1)
+                m_plane[i].lineBuf[j] = src_[i] + j * stride[i];
+        }
+    }
+    return 0;
+}
+}
diff -r 00b686782ad0 -r 96739081845a source/common/scaler.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/scaler.h Mon Aug 26 15:34:49 2019 +0530
@@ -0,0 +1,254 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Pooja Venkatesan <po...@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_SCALER_H
+#define X265_SCALER_H
+
+#include "common.h"
+
+namespace X265_NS {
+//x265 private namespace
+
+class ScalerSlice;
+class VideoDesc;
+
+// extra lines kept in the horizontal-output slice on top of the vertical filter length
+#define MAX_NUM_LINES_AHEAD 4
+// round x up to a multiple of j (j must be a power of two)
+#define SCALER_ALIGN(x, j) (((x)+(j)-1)&~((j)-1))
+// absolute value; evaluates its argument more than once
+#define X265_ABS(j) ((j) >= 0 ? (j) : (-(j)))
+#define SCALER_MAX_REDUCE_CUTOFF 0.002
+#define SCALER_BITEXACT 0x80000
+// integer division of i by j, rounded to nearest
+#define ROUNDED_DIVISION(i,j) (((i)>0 ? (i) + ((j)>>1) : (i) - ((j)>>1))/(j))
+// i >> j rounded up; two equivalent spellings chosen on whether j is a compile-time constant
+#define UH_CEIL_SHIFTR(i,j) (!scale_builtin_constant_p(j) ? -((-(i)) >> (j)) \
+                                                          : ((i) + (1<<(j)) - 1) >> (j))
+
+// scale_builtin_constant_p is only resolved when UH_CEIL_SHIFTR is expanded,
+// so defining it after the macro above is fine
+#if defined(__GNUC__) || defined(__clang__)
+#    define scale_builtin_constant_p __builtin_constant_p
+#else
+#    define scale_builtin_constant_p(x) 0
+#endif
+
+enum ResFactor
+{
+    RES_FACTOR_64, RES_FACTOR_32, RES_FACTOR_16, RES_FACTOR_8,
+    RES_FACTOR_4, RES_FACTOR_DEF, NUM_RES_FACTOR
+};
+
+// output-width classes (multiple of 4 / multiple of 8)
+enum ScalerFactor
+{
+    FACTOR_4, FACTOR_8, NUM_FACTOR
+};
+
+// filter tap counts, with FIL_DEF as the catch-all
+enum FilterSize
+{
+    FIL_4, FIL_6, FIL_8, FIL_9, FIL_10, FIL_11, FIL_13, FIL_15,
+    FIL_16, FIL_17, FIL_19, FIL_22, FIL_24, FIL_DEF, NUM_FIL
+};
+
+// Base class of the four scaling stages: holds the filter coefficients and
+// the slices a stage reads from and writes to.
+class ScalerFilter {
+public:
+    int m_filtLen; // number of coefficients per output sample (presumably; read as the filter length by ScalerFilterManager)
+    int32_t* m_filtPos; // Array of horizontal/vertical starting pos for each dst for luma / chroma planes.
+    int16_t* m_filt; // Array of horizontal/vertical filter coefficients for luma / chroma planes.
+    ScalerSlice* m_sourceSlice; // Source slice
+    ScalerSlice* m_destSlice; // Output slice
+    ScalerFilter();
+    virtual ~ScalerFilter();
+    // run the stage over sliceHor lines starting at line sliceVer
+    virtual void process(int sliceVer, int sliceHor) = 0;
+    int initCoeff(int flag, int inc, int srcW, int dstW, int filtAlign, int one, int sourcePos, int destPos);
+    // attach the input and output slices; the filter does not take ownership
+    void setSlice(ScalerSlice* source, ScalerSlice* dest) { m_sourceSlice = source; m_destSlice = dest; }
+};
+
+// Plain description of a picture format: dimensions, colour space and bit depth.
+class VideoDesc {
+public:
+    int m_width;
+    int m_height;
+    int m_csp;
+    int m_inputDepth;
+
+    VideoDesc(int w, int h, int csp, int bitDepth)
+    {
+        m_width = w;
+        m_height = h;
+        m_csp = csp;
+        m_inputDepth = bitDepth;
+    }
+};
+
+// One plane of a ScalerSlice: a window of lines plus the table of line pointers.
+typedef struct ScalerPlane
+{
+    int availLines; // max number of lines that can be held by this plane
+    int sliceVer; // index of first line
+    int sliceHor; // number of lines
+    uint8_t** lineBuf; // line buffer
+} ScalerPlane;
+
+// Assist horizontal filtering, base class
+class HFilterScaler {
+public:
+    int m_bitDepth;
+public:
+    HFilterScaler() :m_bitDepth(0) {};
+    virtual ~HFilterScaler() {};
+    // filter one line: dstW outputs, filterSize taps each, starting at filterPos[i]
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize) = 0;
+};
+
+// Assist vertical filtering, base class
+class VFilterScaler {
+public:
+    int m_bitDepth;
+public:
+    VFilterScaler() :m_bitDepth(0) {};
+    virtual ~VFilterScaler() {};
+    // combine filterSize intermediate lines (src) into one output line of dstW samples
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW) = 0;
+};
+
+// Assist horizontal filtering, process 8 bit case
+class HFilterScaler8Bit : public HFilterScaler {
+public:
+    HFilterScaler8Bit() { m_bitDepth = 8; }
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize);
+};
+
+// Assist horizontal filtering, process 10 bit case
+class HFilterScaler10Bit : public HFilterScaler {
+public:
+    HFilterScaler10Bit() { m_bitDepth = 10; }
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src,
const int16_t *filter, const int32_t *filterPos, int filterSize);
+};
+
+// Assist vertical filtering, process 8 bit case
+class VFilterScaler8Bit : public VFilterScaler {
+public:
+    VFilterScaler8Bit() { m_bitDepth = 8; }
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW);
+};
+
+// Assist vertical filtering, process 10 bit case
+class VFilterScaler10Bit : public VFilterScaler {
+public:
+    VFilterScaler10Bit() { m_bitDepth = 10; }
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW);
+};
+
+// Horizontal filter for luma
+// Owns a bit-depth specific helper. The helper is created with new and is
+// therefore released with delete (not X265_FREE, which neither matches the
+// allocation nor runs the destructor). For a bitDepth other than 8 or 10 the
+// helper stays NULL instead of being left uninitialised.
+class ScalerHLumFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler;
+public:
+    ScalerHLumFilter(int bitDepth) : m_hFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_hFilterScaler = new HFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_hFilterScaler = new HFilterScaler10Bit;
+    }
+    ~ScalerHLumFilter() { delete m_hFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Horizontal filter for chroma
+// Same ownership rules as ScalerHLumFilter: helper created with new, released
+// with delete, NULL for unsupported bit depths.
+class ScalerHCrFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler;
+public:
+    ScalerHCrFilter(int bitDepth) : m_hFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_hFilterScaler = new HFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_hFilterScaler = new HFilterScaler10Bit;
+    }
+    ~ScalerHCrFilter() { delete m_hFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for luma
+// Owns a bit-depth specific vertical helper: created with new, released with
+// delete, NULL for unsupported bit depths.
+class ScalerVLumFilter : public ScalerFilter {
+private:
+    VFilterScaler* m_vFilterScaler;
+public:
+    ScalerVLumFilter(int bitDepth) : m_vFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_vFilterScaler = new VFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_vFilterScaler = new VFilterScaler10Bit;
+    }
+    ~ScalerVLumFilter() { delete m_vFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for chroma
+// Same ownership rules as ScalerVLumFilter.
+class ScalerVCrFilter : public ScalerFilter {
+private:
+    VFilterScaler* m_vFilterScaler;
+public:
+    ScalerVCrFilter(int bitDepth) : m_vFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_vFilterScaler = new VFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_vFilterScaler = new VFilterScaler10Bit;
+    }
+    ~ScalerVCrFilter() { delete m_vFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// A window of picture lines for four planes. The line pointers either refer
+// to caller-owned memory (initFromSrc) or to memory owned by the slice
+// (createLines); m_destroyLines records which.
+// NOTE(review): the class frees memory in its destructor but is still
+// copyable - copies must not be made.
+class ScalerSlice
+{
+private:
+    enum ScalerSlicePlaneNum { m_numSlicePlane = 4 };
+public:
+    int m_width; // Slice line width
+    int m_hCrSubSample; // horizontal Chroma subsampling factor
+    int m_vCrSubSample; // vertical chroma subsampling factor
+    int m_isRing; // flag to identify if this ScalerSlice is a ring buffer
+    int m_destroyLines; // flag to identify if there are dynamic allocated lines
+    ScalerPlane m_plane[m_numSlicePlane];
+public:
+    ScalerSlice();
+    ~ScalerSlice() { destroy(); }
+    int rotate(int lum, int cr);
+    void fillOnes(int n, int is16bit);
+    int create(int lumLines, int crLines, int h_sub_sample, int v_sub_sample, int ring);
+    int createLines(int size, int width);
+    void destroyLines();
+    void destroy();
+    int initFromSrc(uint8_t *src[4], const int stride[4], int srcW, int lumY, int lumH, int crY, int crH, int relative);
+};
+
+class ScalerFilterManager {
+private:
+    enum ScalerFilterNum { m_numSlice = 3, m_numFilter = 4 };
+
+private:
+    int m_bitDepth;
+    int m_algorithmFlags; // 1, bilinear; 4 bicubic, default is bicubic
+    int m_srcW; // Width of source luma planes.
+    int m_srcH; // Height of source luma planes.
+    int m_dstW; // Width of dest luma planes.
+    int m_dstH; // Height of dest luma planes.
+    int m_crSrcW; // Width of source chroma planes.
+    int m_crSrcH; // Height of source chroma planes.
+    int m_crDstW; // Width of dest chroma planes.
+ int m_crDstH; // Height of dest chroma planes. + int m_crSrcHSubSample; // Binary log of horizontal subsampling factor between Y and Cr planes in src image. + int m_crSrcVSubSample; // Binary log of vertical subsampling factor between Y and Cr planes in src image. + int m_crDstHSubSample; // Binary log of horizontal subsampling factor between Y and Cr planes in dest image. + int m_crDstVSubSample; // Binary log of vertical subsampling factor between Y and Cr planes in dest image. + ScalerSlice* m_slices[m_numSlice]; + ScalerFilter* m_ScalerFilters[m_numFilter]; +private: + int getLocalPos(int crSubSample, int pos); + void getMinBufferSize(int *out_lum_size, int *out_cr_size); + int initScalerSlice(); +public: + ScalerFilterManager(); + ~ScalerFilterManager() { + for (int i = 0; i < m_numSlice; i++) + if (m_slices[i]) { m_slices[i]->destroy(); delete m_slices[i]; m_slices[i] = NULL; } + for (int i = 0; i < m_numFilter; i++) + if (m_ScalerFilters[i]) { delete m_ScalerFilters[i]; m_ScalerFilters[i] = NULL; } + } + int init(int algorithmFlags, VideoDesc* srcVideoDesc, VideoDesc* dstVideoDesc); + int scale_pic(void** src, void** dst, int* srcStride, int* dstStride); +}; +} + +#endif //ifndef X265_SCALER_H
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel