# HG changeset patch
# User Vignesh Vijayakumar
# Date 1507092075 -19800
#      Wed Oct 04 10:11:15 2017 +0530
# Node ID a78accbf7387dfe43ba59367b286af31d77e7c8f
# Parent  44433ded38d00c79fa52e69e7c5c5127009f9ede
x86: Link add_ps_aligned primitive to encoder

diff -r 44433ded38d0 -r a78accbf7387 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp       Fri Oct 06 14:00:56 2017 +0530
+++ b/source/encoder/analysis.cpp       Wed Oct 04 10:11:15 2017 +0530
@@ -3325,8 +3325,17 @@
          * resiYuv. Generate the recon pixels by adding it to the prediction */
 
         if (cu.m_cbf[0][0])
-            primitives.cu[sizeIdx].add_ps(reconPic.getLumaAddr(cu.m_cuAddr, 
absPartIdx), reconPic.m_stride,
-                                          predY, resiYuv.m_buf[0], 
predYuv.m_size, resiYuv.m_size);
+        {
+            bool reconPicAlign = (reconPic.m_cuOffsetY[cu.m_cuAddr] + 
reconPic.m_buOffsetY[absPartIdx]) % 64 == 0;
+            bool predYalign = predYuv.getAddrOffset(absPartIdx, 
predYuv.m_size) % 64 == 0;
+            if (reconPicAlign && predYalign && (reconPic.m_stride % 64 == 0) 
&& (predYuv.m_size % 64 == 0) && (resiYuv.m_size % 64 == 0) && 
+                reconPic.m_param->cpuid & X265_CPU_AVX512)
+                
primitives.cu[sizeIdx].add_ps_aligned(reconPic.getLumaAddr(cu.m_cuAddr, 
absPartIdx), reconPic.m_stride,
+                    predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
+            else
+                
primitives.cu[sizeIdx].add_ps(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), 
reconPic.m_stride,
+                    predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
+        }
         else
             primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, 
absPartIdx), reconPic.m_stride,
                                            predY, predYuv.m_size);
@@ -3334,16 +3343,34 @@
         {
              pixel* predU = predYuv.getCbAddr(absPartIdx);
              pixel* predV = predYuv.getCrAddr(absPartIdx);
-            if (cu.m_cbf[1][0])
-                
primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, 
absPartIdx), reconPic.m_strideC,
-                                                        predU, 
resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
+             if (cu.m_cbf[1][0])
+             {
+                 bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + 
reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
+                 bool predUalign = predYuv.getChromaAddrOffset(absPartIdx) % 
64 == 0;
+                 if (reconPicAlign && predUalign && (reconPic.m_strideC % 64 
== 0) && (predYuv.m_csize % 64 == 0) && (resiYuv.m_csize % 64 == 0) &&
+                     reconPic.m_param->cpuid & X265_CPU_AVX512)
+                     
primitives.chroma[m_csp].cu[sizeIdx].add_ps_aligned(reconPic.getCbAddr(cu.m_cuAddr,
 absPartIdx), reconPic.m_strideC,
+                         predU, resiYuv.m_buf[1], predYuv.m_csize, 
resiYuv.m_csize);
+                 else
+                     
primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, 
absPartIdx), reconPic.m_strideC,
+                         predU, resiYuv.m_buf[1], predYuv.m_csize, 
resiYuv.m_csize);
+             }
             else
                 
primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, 
absPartIdx), reconPic.m_strideC,
                                                          predU, 
predYuv.m_csize);
 
-            if (cu.m_cbf[2][0])
-                
primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, 
absPartIdx), reconPic.m_strideC,
-                                                        predV, 
resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
+             if (cu.m_cbf[2][0])
+             {
+                 bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + 
reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
+                 bool predValign = predYuv.getChromaAddrOffset(absPartIdx) % 
64 == 0;
+                 if (reconPicAlign && predValign && (reconPic.m_strideC % 64 
== 0) && (predYuv.m_csize % 64 == 0) && (resiYuv.m_csize % 64 == 0) &&
+                     reconPic.m_param->cpuid & X265_CPU_AVX512)
+                     
primitives.chroma[m_csp].cu[sizeIdx].add_ps_aligned(reconPic.getCrAddr(cu.m_cuAddr,
 absPartIdx), reconPic.m_strideC,
+                         predV, resiYuv.m_buf[2], predYuv.m_csize, 
resiYuv.m_csize);
+                 else
+                     
primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, 
absPartIdx), reconPic.m_strideC,
+                         predV, resiYuv.m_buf[2], predYuv.m_csize, 
resiYuv.m_csize);
+             }
             else
                 
primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, 
absPartIdx), reconPic.m_strideC,
                                                          predV, 
predYuv.m_csize);
diff -r 44433ded38d0 -r a78accbf7387 source/encoder/search.cpp
--- a/source/encoder/search.cpp Fri Oct 06 14:00:56 2017 +0530
+++ b/source/encoder/search.cpp Wed Oct 04 10:11:15 2017 +0530
@@ -363,7 +363,13 @@
         if (numSig)
         {
             m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, 
TEXT_LUMA, true, false, numSig);
-            primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, 
residual, stride, stride);
+            bool reconQtYuvAlign = 
m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 
== 0;
+            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, 
mode.predYuv.m_size) % 64 == 0;
+            bool residualAlign = 
m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 
64 == 0;
+            if (reconQtStride % 64 == 0 && stride % 64 == 0 && reconQtYuvAlign 
&& predAlign && residualAlign && cu.m_encData->m_param->cpuid & X265_CPU_AVX512)
+                primitives.cu[sizeIdx].add_ps_aligned(reconQt, reconQtStride, 
pred, residual, stride, stride);
+            else
+                primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, 
residual, stride, stride);
         }
         else
             // no coded residual, recon = pred
@@ -561,6 +567,7 @@
 
         coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
         pixel*   tmpRecon = (useTSkip ? m_tsRecon : reconQt);
+        bool tmpReconAlign = (useTSkip ? 1 : 
(m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, 
m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
         uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
 
         if ((stride % 64 == 0) && (m_param->cpuid & X265_CPU_AVX512))
@@ -572,7 +579,12 @@
         if (numSig)
         {
             m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, 
TEXT_LUMA, true, useTSkip, numSig);
-            primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, 
residual, stride, stride);
+            bool residualAlign = 
m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, 
m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
+            bool predAlign = predYuv->getAddrOffset(absPartIdx, 
predYuv->m_size) % 64 == 0;
+            if (stride % 64 == 0 && tmpReconStride % 64 == 0 && tmpReconAlign 
&& residualAlign && predAlign && m_param->cpuid & X265_CPU_AVX512)
+                primitives.cu[sizeIdx].add_ps_aligned(tmpRecon, 
tmpReconStride, pred, residual, stride, stride);
+            else
+                primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, 
residual, stride, stride);
         }
         else if (useTSkip)
         {
@@ -732,7 +744,13 @@
         if (numSig)
         {
             m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, 
TEXT_LUMA, true, false, numSig);
-            primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, 
residual, stride, stride);
+            bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + 
reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
+            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, 
mode.predYuv.m_size) % 64 == 0;
+            bool residualAlign = 
m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, 
m_rqt[cuGeom.depth].tmpResiYuv.m_size)% 64 == 0;
+            if (picStride % 64 == 0 && stride % 64 == 0 && picReconYAlign && 
predAlign && residualAlign && m_param->cpuid & X265_CPU_AVX512)
+                primitives.cu[sizeIdx].add_ps_aligned(picReconY, picStride, 
pred, residual, stride, stride);
+            else
+                primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, 
residual, stride, stride);
             cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
         }
         else
@@ -910,7 +928,13 @@
             if (numSig)
             {
                 m_quant.invtransformNxN(cu, residual, stride, coeffC, 
log2TrSizeC, ttype, true, false, numSig);
-                primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, 
residual, stride, stride);
+                bool reconQtAlign = 
m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) 
% 64 == 0;
+                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) 
% 64 == 0;
+                if (reconQtAlign && predAlign && residualAlign && 
(reconQtStride % 64 == 0) && (stride % 64 == 0) && m_param->cpuid & 
X265_CPU_AVX512)
+                    primitives.cu[sizeIdxC].add_ps_aligned(reconQt, 
reconQtStride, pred, residual, stride, stride);
+                else
+                    primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, 
pred, residual, stride, stride);
                 cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, 
tuIterator.absPartIdxStep);
             }
             else
@@ -1013,7 +1037,13 @@
                 if (numSig)
                 {
                     m_quant.invtransformNxN(cu, residual, stride, coeff, 
log2TrSizeC, ttype, true, useTSkip, numSig);
-                    primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, 
residual, stride, stride);
+                    bool reconAlign = (useTSkip ? 1 : 
m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
+                    bool predYuvAlign = 
mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+                    bool residualAlign = 
m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+                    if (reconAlign && predYuvAlign && residualAlign && 
(reconStride % 64 == 0) && (stride % 64 == 0) && m_param->cpuid & 
X265_CPU_AVX512)
+                        primitives.cu[sizeIdxC].add_ps_aligned(recon, 
reconStride, pred, residual, stride, stride);
+                    else
+                        primitives.cu[sizeIdxC].add_ps(recon, reconStride, 
pred, residual, stride, stride);
                     cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, 
tuIterator.absPartIdxStep);
                 }
                 else if (useTSkip)
@@ -1207,7 +1237,13 @@
             if (numSig)
             {
                 m_quant.invtransformNxN(cu, residual, stride, coeffC, 
log2TrSizeC, ttype, true, false, numSig);
-                primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, 
residual, stride, stride);
+                bool picReconCAlign = (reconPic->m_cuOffsetC[cu.m_cuAddr] + 
reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0;
+                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) 
% 64 == 0;
+                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC)% 
64 == 0;
+                if (picReconCAlign && predAlign && residualAlign && (picStride 
% 64 == 0) && (stride % 64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+                    primitives.cu[sizeIdxC].add_ps_aligned(picReconC, 
picStride, pred, residual, stride, stride);
+                else
+                    primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, 
residual, stride, stride);
                 cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, 
tuIterator.absPartIdxStep);
             }
             else
@@ -3223,8 +3259,14 @@
             // non-zero cost calculation for luma - This is an approximation
             // finally we have to encode correct cbf after comparing with null cost
             pixel* curReconY = 
m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
+            bool curReconYAlign = 
m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, 
m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
             uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
-            primitives.cu[partSize].add_ps(curReconY, strideReconY, 
mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, 
strideResiY);
+            bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, 
mode.predYuv.m_size) % 64 == 0;
+            bool curResiYAlign = 
m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, 
m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0;
+            if (curReconYAlign && predYuvAlign && curResiYAlign && 
(strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 
64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+                primitives.cu[partSize].add_ps_aligned(curReconY, 
strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, 
mode.predYuv.m_size, strideResiY);
+            else
+                primitives.cu[partSize].add_ps(curReconY, strideReconY, 
mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, 
strideResiY);
 
             const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, 
fencYuv->m_size, curReconY, strideReconY);
             uint32_t nzCbfBitsY = 
m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
@@ -3345,7 +3387,13 @@
                         // finally we have to encode correct cbf after comparing with null cost
                         pixel* curReconC      = 
m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
                         uint32_t strideReconC = 
m_rqt[qtLayer].reconQtYuv.m_csize;
-                        primitives.cu[partSizeC].add_ps(curReconC, 
strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, 
mode.predYuv.m_csize, strideResiC);
+                        bool curReconCAlign = 
m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+                        bool predYuvAlign = 
mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+                        bool curResiCAlign = 
m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+                        if (curReconCAlign && predYuvAlign && curResiCAlign && 
(strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 
64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+                            primitives.cu[partSizeC].add_ps_aligned(curReconC, 
strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, 
mode.predYuv.m_csize, strideResiC);
+                        else
+                            primitives.cu[partSizeC].add_ps(curReconC, 
strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, 
mode.predYuv.m_csize, strideResiC);
                         sse_t nonZeroDistC = 
m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, 
fencYuv->m_csize, curReconC, strideReconC));
                         uint32_t nzCbfBitsC = 
m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], 
(TextType)chromaId, tuDepth);
                         uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
@@ -3455,8 +3503,12 @@
                 const uint32_t skipSingleBitsY = 
m_entropyCoder.getNumberOfWrittenBits();
 
                 m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, 
log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
-
-                primitives.cu[partSize].add_ps(m_tsRecon, trSize, 
mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, 
trSize);
+                bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, 
mode.predYuv.m_size) % 64 == 0;
+
+                if (predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size 
% 64 == 0) && m_param->cpuid & X265_CPU_AVX512)
+                    primitives.cu[partSize].add_ps_aligned(m_tsRecon, trSize, 
mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, 
trSize);
+                else
+                    primitives.cu[partSize].add_ps(m_tsRecon, trSize, 
mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, 
trSize);
                 nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, 
fencYuv->m_size, m_tsRecon, trSize);
 
                 if (m_rdCost.m_psyRd)
@@ -3533,7 +3585,11 @@
 
                         m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, 
m_tsCoeff,
                                                 log2TrSizeC, 
(TextType)chromaId, false, true, numSigTSkipC);
-                        primitives.cu[partSizeC].add_ps(m_tsRecon, trSizeC, 
mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, 
mode.predYuv.m_csize, trSizeC);
+                        bool predYuvAlign = 
mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
+                        if (predYuvAlign && (trSizeC % 64 == 0) && 
(mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0) && m_param->cpuid & 
X265_CPU_AVX512)
+                            primitives.cu[partSizeC].add_ps_aligned(m_tsRecon, 
trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, 
mode.predYuv.m_csize, trSizeC);
+                        else
+                            primitives.cu[partSizeC].add_ps(m_tsRecon, 
trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, 
mode.predYuv.m_csize, trSizeC);
                         nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, 
primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
                         if (m_rdCost.m_psyRd)
                         {
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to