http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java index e99c2ef..f81e929 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java @@ -18,26 +18,13 @@ */ package org.apache.sysml.runtime.matrix.data; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; -import java.util.concurrent.Callable; -import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardDataHelper.*; -import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dBackwardFilterHelper.*; -import org.apache.sysml.runtime.matrix.data.LibMatrixDNNConv2dHelper.*; -import org.apache.sysml.runtime.matrix.data.LibMatrixDNNPoolingBackwardHelper.*; -import org.apache.sysml.runtime.matrix.data.LibMatrixDNNPoolingHelper.*; -import org.apache.sysml.runtime.instructions.InstructionUtils; -import org.apache.sysml.runtime.util.ConvolutionUtils; import org.apache.sysml.utils.NativeHelper; -import org.apache.sysml.utils.Statistics; -public class LibMatrixDNNHelper { - +public class LibMatrixDNNHelper +{ protected static class CellIndex3 { public int ix1; public int ix2; @@ -48,243 +35,6 @@ public class LibMatrixDNNHelper { } } - // *********************************** low-level runtime operator selection *********************************************** - // *********************************** based on runtime properties (sparsity, native, etc) ******************************** - // These methods help reduce branch miss predictions and instruction-cache misses. - // Also, they simplify the design of LibMatrixDNN and help in code-maintenance. - - /** - * Factory method that returns list of callable tasks for performing maxpooling operation - * - * @param params convolution parameters - * @return list of callable tasks for performing maxpooling operation - * @throws DMLRuntimeException if error occurs - */ - public static ArrayList<Callable<Long>> getMaxPoolingWorkers(ConvolutionParameters params) throws DMLRuntimeException { - ArrayList<Callable<Long>> ret = new ArrayList<>(); - int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); - int taskSize = (int)(Math.ceil((double)params.N / k)); - for(int i = 0; i*taskSize < params.N; i++) { - if(params.input1.isInSparseFormat()) - ret.add(new SparseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - else - ret.add(new DenseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - } - return ret; - } - - /** - * Factory method that returns list of callable tasks for performing maxpooling backward operation - * - * @param params convolution parameters - * @param performReluBackward whether to perform ReLU backward - * @return list of callable tasks for performing maxpooling backward operation - * @throws DMLRuntimeException if error occurs - */ - public static ArrayList<Callable<Long>> getMaxPoolingBackwardWorkers(ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { - ArrayList<Callable<Long>> ret = new ArrayList<>(); - int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); - int taskSize = (int)(Math.ceil((double)params.N / k)); - boolean sparse1 = params.input1.isInSparseFormat(); - boolean sparse2 = params.input2.isInSparseFormat(); - for(int i = 0; i*taskSize < params.N; i++) { - if( !sparse1 && !sparse2 ) - ret.add(new PoolingBackwardDenseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); - else if( !sparse1 && sparse2 ) - ret.add(new PoolingBackwardDenseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); - else if( sparse1 && !sparse2 ) - ret.add(new PoolingBackwardSparseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); - else if( sparse1 && sparse2 ) - ret.add(new PoolingBackwardSparseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); - } - return ret; - } - - /** - * Factory method that returns list of callable tasks for performing relu backward operation - * - * @param params convolution parameters - * @return list of callable tasks for performing relu backward operation - * @throws DMLRuntimeException if error occurs - */ - public static ArrayList<Callable<Long>> getReluBackwardWorkers(ConvolutionParameters params) throws DMLRuntimeException { - ArrayList<Callable<Long>> ret = new ArrayList<>(); - int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); - int taskSize = (int)(Math.ceil((double)params.N / k)); - for(int i = 0; i*taskSize < params.N; i++) { - ret.add(new ReluBackward(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - } - return ret; - } - - /** - * Factory method that returns list of callable tasks for performing conv2d - * - * @param params convolution parameters - * @return list of callable tasks for performing conv2d - * @throws DMLRuntimeException if error occurs - */ - public static ArrayList<Callable<Long>> getConv2dWorkers(ConvolutionParameters params) throws DMLRuntimeException { - ArrayList<Callable<Long>> ret = new ArrayList<>(); - - // Try to create twice as many tasks as threads for improved load balance - // (due to constant-sized intermediates, GC works well, so the overhead per task is small) - int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); - int taskSize = (int)(Math.ceil((double)params.N / k / 2)); - - // TODO: Decide here based on params whether to use LoopedIm2ColConv2dAllChannels or LoopedIm2ColConv2dOneChannel - // For now, let's stick to the existing approach of converting [1, CHW] to [CRS, PQ] as it allows matrix multiplication large enough matrix. - boolean allChannels = true; ArrayList<MatrixBlock> filters = null; - if(!allChannels) - filters = splitFilter(params); - - MatrixBlock in1 = params.input1; - boolean isEmptyDenseInput = !in1.isInSparseFormat() && in1.denseBlock == null; - boolean isTransPref = in1.sparse && !params.input2.sparse && - MatrixBlock.evalSparseFormatInMemory(in1.clen, in1.rlen, in1.nonZeros); - boolean applyNative = LibMatrixDNN.isEligibleForConv2dSparse(params) - && !(!isEmptyDenseInput && allChannels && isTransPref); - if( applyNative ) - Statistics.numNativeSparseConv2dCalls.increment(); - - //transpose filter once for efficient sparse-dense multiplies in LoopedIm2ColConv2dTransAllChan - //in order to share the temporary object and its creation costs across threads - if( !applyNative && !isEmptyDenseInput && allChannels && isTransPref ) { - params.input2 = LibMatrixReorg.transpose(params.input2, - new MatrixBlock(params.input2.clen, params.input2.rlen, false), k); - } - - for(int i = 0; i*taskSize < params.N; i++) { - //note: we prefer the java backend for sparse inputs because the native - //implementation simply converts the sparse input into dense rows - if( applyNative ) - ret.add(new SparseNativeConv2d(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - else if(!isEmptyDenseInput && allChannels && isTransPref) - ret.add(new LoopedIm2ColConv2dTransAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - else if(!isEmptyDenseInput && allChannels) - ret.add(new LoopedIm2ColConv2dAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - else if(!isEmptyDenseInput && !allChannels) - ret.add(new LoopedIm2ColConv2dOneChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params, filters)); - else - throw new DMLRuntimeException("Unsupported operator"); - } - return ret; - } - - /** - * Factory method that returns list of callable tasks for performing conv2d backward filter - * - * @param params convolution parameters - * @return list of callable tasks for performing conv2d backward filter - * @throws DMLRuntimeException if error occurs - */ - public static ArrayList<Callable<Long>> getConv2dBackwardFilterWorkers(ConvolutionParameters params) throws DMLRuntimeException { - ArrayList<Callable<Long>> ret = new ArrayList<>(); - // Try to create as many tasks as threads. - // Creating more tasks will help in tail, but would have additional overhead of maintaining the intermediate - // data structures such as im2col blocks. - int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); - int taskSize = (int)(Math.ceil((double)params.N / k)); - - boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || - (!params.input2.isInSparseFormat() && params.input2.denseBlock == null); - boolean applyNative = LibMatrixDNN.isEligibleForConv2dBackwardFilterSparseDense(params) - && !params.input2.isInSparseFormat(); - if( applyNative ) - Statistics.numNativeSparseConv2dBwdFilterCalls.increment(); - - for(int i = 0; i*taskSize < params.N; i++) { - //note: we prefer the java backend for sparse filters because the native - //implementation simply rotates the sparse filters into dense rows - if( applyNative ) - ret.add(new SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - else if( params.input2.sparse && params.input1.getSparsity() > params.input2.getSparsity() ) - ret.add(new Conv2dBackwardFilterTrans(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - else if(!isEmptyDenseInput) - ret.add(new Conv2dBackwardFilter(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - else - throw new DMLRuntimeException("Unsupported operator"); - } - return ret; - } - - /** - * Factory method that returns list of callable tasks for performing conv2d backward data - * - * @param params convolution parameters - * @return list of callable tasks for performing conv2d backward data - * @throws DMLRuntimeException if error occurs - */ - public static ArrayList<Callable<Long>> getConv2dBackwardDataWorkers(ConvolutionParameters params) throws DMLRuntimeException { - ArrayList<Callable<Long>> ret = new ArrayList<>(); - - // Try to create as many tasks as threads. - // Creating more tasks will help in tail, but would have additional overhead of maintaining the intermediate - // data structures such as im2col blocks. - int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); - int taskSize = (int)(Math.ceil((double)params.N / k)); - - boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || - (!params.input2.isInSparseFormat() && params.input2.denseBlock == null); - boolean applyNative = LibMatrixDNN.isEligibleForConv2dBackwardDataDense(params) - && !params.input2.isInSparseFormat(); - if( applyNative ) - Statistics.numNativeSparseConv2dBwdDataCalls.increment(); - - for(int i = 0; i*taskSize < params.N; i++) { - //note: we prefer the java backend for sparse filters because the native - //implementation simply converts the sparse filters into dense rows - if( applyNative ) - ret.add(new SparseNativeConv2dBackwardDataDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - else if(!isEmptyDenseInput) - ret.add(new Conv2dBackwardData(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); - else - throw new DMLRuntimeException("Unsupported operator"); - } - - return ret; - } - - // *********************************** relu backward operator ****************************************************** - - /** - * Performs the operation: (X gt 0) * dout - */ - public static class ReluBackward implements Callable<Long> - { - public int _rl; public int _ru; - private final ConvolutionParameters _params; - double [] outputArray; int numOutCols; - public ReluBackward(int rl, int ru, ConvolutionParameters params) { - _rl = rl; _ru = ru; - _params = params; - outputArray= params.output.getDenseBlockValues(); - numOutCols = params.input1.getNumColumns(); - } - - @Override - public Long call() throws Exception { - if(!_params.input1.isInSparseFormat() && !_params.input2.isInSparseFormat()) { - double [] inputArr = _params.input1.getDenseBlockValues(); - double [] doutArr = _params.input2.getDenseBlockValues(); - for(int i = _rl*numOutCols; i < _ru*numOutCols; i++) { - outputArray[i] = inputArr[i] > 0 ? doutArr[i] : 0; - } - } - else { - // Perform (X > 0) - ConvolutionUtils.scalarOperations(_params.input1, outputArray, _rl*numOutCols, numOutCols, _rl, _ru, - InstructionUtils.parseScalarBinaryOperator(">", false, 0)); - // Then perform (X > 0) * dout - ConvolutionUtils.binaryOperationInPlace(_params.input2, outputArray, _rl*numOutCols, numOutCols, _rl, _ru, - LibMatrixDNN._binaryElementWiseMultiplication); - } - return 0L; - } - } - - // *********************************** utility methods ****************************************************** - protected static CellIndex3 computeTensorIndexes(int j, int H, int W) { return computeTensorIndexes(j, H, W, new CellIndex3()); } @@ -307,54 +57,8 @@ public class LibMatrixDNNHelper { return ret; } - //Split a filter of size [K, CRS] into c filters of [K, RS] - private static ArrayList<MatrixBlock> splitFilter(ConvolutionParameters _params) { - ArrayList<MatrixBlock> ret = new ArrayList<>(); - int RS = _params.R*_params.S; int CRS = _params.C*_params.R*_params.S; - double [] filter = _params.input2.getDenseBlockValues(); int S = _params.S; - for(int c = 0; c < _params.C; c++) { - MatrixBlock mb = new MatrixBlock(_params.K, RS, false); - mb.allocateDenseBlock(); long nnz = 0; - double [] outputArr = mb.getDenseBlockValues(); - if(filter != null) { - for(int k = 0; k < _params.K; k++) { - int outOffset = k*RS; - int filterOffset = k*CRS + c*RS; - for(int rs = 0; rs < RS; rs++) { - int outIndex = outOffset + rs; - outputArr[outIndex] = filter[filterOffset + rs]; - nnz += outputArr[outIndex] != 0 ? 1 : 0; - } - } - } - else { - SparseBlock sblock = _params.input2.sparseBlock; - CellIndex3 ix = new CellIndex3(); - for(int k = 0; k < _params.K; k++) { - if( sblock.isEmpty(k) ) continue; - // Find maxIndex - int apos = sblock.pos(k); - int alen = sblock.size(k); - int[] aix = sblock.indexes(k); - double[] avals = sblock.values(k); - for(int j=apos; j<apos+alen; j++) { - ix = computeTensorIndexes(aix[j], _params.R, _params.S, ix); - if(c != ix.ix1) - continue; - outputArr[k*RS + ix.ix2*S + ix.ix3] = avals[j]; - nnz += outputArr[k*RS + ix.ix2*S + ix.ix3] != 0 ? 1 : 0; - } - } - } - mb.setNonZeros(nnz); - ret.add(mb); - } - return ret; - } - - // Single-threaded matrix multiplication - static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, - boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException { + protected static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, + boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException { if( !params.enableNative || m1.sparse || m2.sparse ) { prepNonZerosForMatrixMult(m1, recomputeNNZM1); prepNonZerosForMatrixMult(m2, recomputeNNZM2); @@ -372,228 +76,6 @@ public class LibMatrixDNNHelper { ret.setNonZeros((long)ret.rlen*ret.clen); } - static void addBias(int r, double [] out, double [] bias, int K, int PQ) { - for(int k=0, cix=r*K*PQ; k<K; k++, cix+=PQ) - LibMatrixMult.vectAddInPlace(bias[k], out, cix, PQ); - } - - /** - * Returns the index of cell with maximum value. This method is optimized for dense input - * - * @param p output feature map height - * @param q output feature map width - * @param inputOffset offset to be used for input index - * @param inputArray input array - * @param params convolution parameters - * @param performReluBackward perform ReLU backward - * @return index of cell with maximum value - */ - static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params, boolean performReluBackward) { - int start_index_h = params.start_indexes_h[p]; - int end_index_h = params.end_indexes_h[p]; - int start_index_w = params.start_indexes_w[q]; - int end_index_w = params.end_indexes_w[q]; - - int maxIndex = -1; - double maxVal = -Double.MAX_VALUE; - - // Note: We do not treat pad as zero and hence we don't do: - // maxVal = 0 - // if start_index_h < 0 || start_index_w < 0 || end_index_h >= params.H || end_index_w >= params.W - - // Find maxIndex - double currDoutVal = -1; - for (int h = start_index_h; h < end_index_h; h++) { - for (int w = start_index_w; w < end_index_w; w++) { - currDoutVal = inputArray[inputOffset + h*params.W + w]; - currDoutVal = performReluBackward && currDoutVal < 0 ? 0 : currDoutVal; - if(maxVal < currDoutVal) { - maxIndex = inputOffset + h*params.W + w; - maxVal = currDoutVal; - } - } - } - return maxIndex; - } - - /** - * Returns the index of cell with maximum value. This method is optimized for sparse input - * - * @param p output feature map height - * @param q output feature map width - * @param inputOffset offset to be used for input index - * @param n number of images - * @param c number of channels - * @param input input matrix - * @param params convolution parameters - * @param performReluBackward perform ReLU on input - * @return index of the cell with maximum value - * @throws DMLRuntimeException if error occurs - */ - static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { - int start_h = params.start_indexes_h[p]; - int end_h = params.end_indexes_h[p]; - int start_w = params.start_indexes_w[q]; - int end_w = params.end_indexes_w[q]; - int numVals = (end_h-start_h)*(end_w-start_w); - BitSet bmap = new BitSet(numVals); - - int maxIndex = -1; - double maxVal = -Double.MAX_VALUE; - - // Note: We do not treat pad as zero and hence we don't do: - // maxVal = 0 - // if start_index_h < 0 || start_index_w < 0 || end_index_h >= params.H || end_index_w >= params.W - - // input.isEmptyBlock() check is done by the caller - CellIndex3 ix = new CellIndex3(); - SparseBlock sblock = input.sparseBlock; - if( !sblock.isEmpty(n) ) { - // Find maxIndex - int apos = sblock.pos(n); - int alen = sblock.size(n); - int[] aix = sblock.indexes(n); - double[] avals = sblock.values(n); - for(int j=apos; j<apos+alen; j++) { - ix = computeTensorIndexes(aix[j], params.H, params.W, ix); - if(c != ix.ix1) continue; - int h = ix.ix2; - int w = ix.ix3; - if(h >= start_h && h < end_h && w >= start_w && w < end_w) { - double val = performReluBackward && avals[j] < 0 ? 0 : avals[j]; - if(maxVal < val) { - maxIndex = inputOffset + h*params.W + w; - maxVal = val; - } - bmap.set((h-start_h)*(end_w-start_w) + (w-start_w)); - } - } - } - - //correction for 0 maximum and subset of evaluated entries - int firstMiss = bmap.nextClearBit(0); - if( firstMiss < numVals && maxVal<0 ) { - int lh = firstMiss/(end_w-start_w)+start_h; - int lw = firstMiss%(end_w-start_w)+start_w; - maxIndex = inputOffset + lh * params.W + lw; - maxVal = 0; - } - return maxIndex; - } - - // Returns the row of matrix in dense format - static void getRowInDenseFormat(MatrixBlock input, int n, double [] ret) throws DMLRuntimeException { - if(input.getNumColumns() != ret.length) { - throw new DMLRuntimeException("Invalid parameters"); - } - // Use temporary array to avoid binary search - if(input.isInSparseFormat()) { - Arrays.fill(ret, 0); - if( !input.sparseBlock.isEmpty(n) ) { - int apos = input.sparseBlock.pos(n); - int alen = input.sparseBlock.size(n); - int[] aix = input.sparseBlock.indexes(n); - double[] avals = input.sparseBlock.values(n); - for(int j=apos; j<apos+alen; j++) - ret[ aix[j] ] = avals[j]; - } - } - else { - System.arraycopy(input.getDenseBlockValues(), - n*input.getNumColumns(), ret, 0, input.getNumColumns()); - } - } - - // ------------------------------------------------------------------------------------------------------ - // Since col2im always operates on intermediate generated as part of matmult, it is not clear which operator to select apriori. - // Therefore, it is provided as utility function rather than an operator (like im2col or rotate180) - - //Converts input: PQ X CRS matrix and writes to 1 X CHW - static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException { - if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) { - throw new DMLRuntimeException("Incorrect input dimensions"); - } - - double [] outputArray = null; - if (!params.output.isInSparseFormat()) - outputArray = params.output.getDenseBlockValues(); - else { - throw new DMLRuntimeException("Only dense output is implemented"); - } - - if(!input.isInSparseFormat()) { - double [] inputArray = input.getDenseBlockValues(); - doCol2IMDenseInput(0, outputN, inputArray, outputArray, params); - } - else { - if(!input.isEmptyBlock()) { - int outOffset = outputN*params.C*params.H*params.W; - int HW = params.H*params.W; - CellIndex3 ix = new CellIndex3(); - SparseBlock sblock = input.sparseBlock; - for(int i = 0; i < input.getNumRows(); i++) { - if( sblock.isEmpty(i) ) continue; - ix = computeTensorIndexes(i, params.P, params.Q, ix); - int tmpP = ix.ix2*params.stride_h - params.pad_h; - int tmpQ = ix.ix3*params.stride_w - params.pad_w; - if(ix.ix1 != 0) - throw new DMLRuntimeException("Incorrect tensor indexes: "+ ix + ", " + params.P + " " + params.Q); - int apos = sblock.pos(i); - int alen = sblock.size(i); - int[] aix = sblock.indexes(i); - double[] avals = sblock.values(i); - for(int j = apos; j < apos+alen; j++) { - ix = computeTensorIndexes(aix[j], params.R, params.S, ix); - int h = tmpP + ix.ix2; - int w = tmpQ + ix.ix3; - if(h >= 0 && h < params.H && w >= 0 && w < params.W) { - int outIndex = outOffset + ix.ix1*HW + h*params.W + w; - outputArray[outIndex] += avals[j]; - } - } - } - } - } - } - - // Converts input: PQ X CRS matrix and writes to 1 X CHW if inputN == 0 - // Or converts input: NPQ X CRS matrix and writes to N X CHW - private static void doCol2IMDenseInput(int inputN, int outputN, double [] inputArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { - final int outputNOffset = outputN*params.C*params.H*params.W; - final int HW = params.H*params.W; - final int inputNPQ = inputN*params.P*params.Q; - final int CRS = params.C*params.R*params.S; - final int RS = params.R*params.S; - for (int p = 0; p < params.P; p++) { - // h = p*params.stride_h + r - params.pad_h - // = r + hOffset - // Based on restrictions: h >= 0 and r >= 0 and h < params.H and r < params.R, we get - // max(0, - hOffset) <= r < min(params.R, params.H - hOffset) - final int hOffset = p*params.stride_h - params.pad_h; - final int rStart = Math.max(0, - hOffset); - final int rEnd = Math.min(params.R, params.H - hOffset); - for (int q = 0; q < params.Q; q++) { - // Using the same logic as above on following: - // w = q*params.stride_w + s - params.pad_w - final int wOffset = q*params.stride_w - params.pad_w; - final int sStart = Math.max(0, - wOffset); - final int sEnd = Math.min(params.S, params.W - wOffset); - final int tempOffset = (inputNPQ + p*params.Q + q)*CRS; - for (int c = 0; c < params.C; c++) { - final int outOffset = outputNOffset + c*HW; - final int inputOffset = tempOffset + c*RS; - for (int r = rStart; r < rEnd; r++) { - for (int s = sStart; s < sEnd; s++) { - int inputIndex = inputOffset + r*params.S + s; - int outIndex = outOffset + (hOffset + r)*params.W + wOffset + s; - outputArray[outIndex] += inputArray[inputIndex]; - } - } - } - } - } - } - private static void prepNonZerosForMatrixMult(MatrixBlock mb, boolean update) { if( !update ) return;
http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2Col.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2Col.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2Col.java new file mode 100644 index 0000000..ac7a6a8 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2Col.java @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.Arrays; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.matrix.data.LibMatrixDNNHelper.CellIndex3; + +/** + * This class contains the different implementation of im2col operation + */ +public class LibMatrixDNNIm2Col { + private static final Log LOG = LogFactory.getLog(LibMatrixDNNIm2Col.class.getName()); + static interface Im2colWorker { + public void execute(int n); + public static Im2colWorker getWorker(MatrixBlock input, MatrixBlock out, ConvolutionParameters params, boolean trans) { + if(!input.isInSparseFormat()) { + boolean stride1Pad0 = params.stride_h == 1 && params.stride_w == 1 + && params.pad_h == 0 && params.pad_w == 0; + // Note: Only dense im2col operators require the im2ColOutBlock to be allocated in the dense format. + out.reset(out.rlen, out.clen, false); + out.allocateDenseBlock(); + if( LOG.isTraceEnabled() ) + LOG.trace("Using DenseIm2colWorkerAllChannels operator to perform im2col (stride1pad0="+stride1Pad0+")."); + if( stride1Pad0 && !trans ) + return new DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlockValues(), out.getDenseBlockValues(), params); + else + return new DenseIm2colWorkerAllChannels(input.getDenseBlockValues(), out.getDenseBlockValues(), params, trans); + } + else { + if(LOG.isTraceEnabled()) + LOG.trace("Using SparseIm2colWorker operator to perform im2col."); + out.reset(out.rlen, out.clen, true); + out.allocateSparseRowsBlock(); + //preallocate sparse-rows (larger than average sparsity to account for skew) + int estnnz = (int)Math.ceil(4*input.getSparsity()*out.clen); + for(int r = 0; r < out.rlen; r++) + out.getSparseBlock().allocate(r, Math.min(estnnz, out.clen)); + return new SparseSparseIm2colWorkerAllChan(input, out, params, trans); + } + } + } + + /** + * Special case operator for performing dense im2col when stride = [1, 1] and pad = [0, 0] by using System.arraycopy + */ + private static class DenseIm2colWorkerStride1Pad0AllChannels implements Im2colWorker { + private final double [] inputArray, outputArray; + private final int CRS, S, R, P, Q, CHW, H, W; + public DenseIm2colWorkerStride1Pad0AllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params) { + this.inputArray = inputArray; + this.outputArray = outputArray; + this.CRS = params.C * params.R * params.S; + this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; + this.CHW = params.C*params.H*params.W; + } + + @Override + public void execute(int n) { + int nOffset = n * CHW; + for (int c = 0; c < CRS; ++c) { + int wOffset = c % S; + int hOffset = (c / S) % R; + int cInput = c / R / S; + for (int h = 0; h < P; ++h) { + int hPadded = h + hOffset; + int outOffset = (c * P + h) * Q; + int inputOffset = nOffset + (cInput * H + hPadded) * W; + System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, Q); + int w = Q - 1; + int wPadded = w + wOffset; + boolean assign = (hPadded < H && wPadded < W); + outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0; + } + } + } + } + + /** + * Performing dense im2col (general case) + */ + private static class DenseIm2colWorkerAllChannels implements Im2colWorker { + private final double[] inputArray, outputArray; + private final int CRS, S, R, P, Q, CHW, H, W; + private final int stride_h, stride_w, pad_h, pad_w; + private final boolean trans; + public DenseIm2colWorkerAllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params, boolean trans) { + this.inputArray = inputArray; + this.outputArray = outputArray; + this.CRS = params.C * params.R * params.S; + this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; + this.CHW = params.C*params.H*params.W; + this.stride_h = params.stride_h; this.stride_w = params.stride_w; + this.pad_h = params.pad_h; this.pad_w = params.pad_w; + this.trans = trans; + } + + @Override + public void execute(int n) { + //reset for selective copy + Arrays.fill(outputArray, 0); + + int nOffset = n * CHW; + for (int c = 0; c < CRS; ++c) { + int wOffset = c % S; + int hOffset = (c / S) % R; + int cInput = c / R / S; + for (int h = 0; h < P; ++h) { + int outOffset = trans ? c+(h*Q*CRS) : (c*P+h)*Q; + int hPadded = h * stride_h - pad_h + hOffset; + int inputOffset = nOffset + (cInput * H + hPadded) * W; + if (hPadded < 0 || hPadded >= H ) continue; + for (int w = 0; w < Q; ++w) { + int wPadded = w * stride_w - pad_w + wOffset; + if( wPadded >= 0 && wPadded < W ) + outputArray[outOffset + (trans?w*CRS:w)] + = inputArray[inputOffset + wPadded]; + } + } + } + } + } + + /** + * Performing sparse im2col for all channels for a given row n of the input matrix. + */ + private static class SparseSparseIm2colWorkerAllChan implements Im2colWorker { + private final MatrixBlock input, output; + private final int S, R, P, Q, W, HW, RS; + private final int stride_h, stride_w, pad_h, pad_w; + private final boolean trans; + private final boolean simple; + public SparseSparseIm2colWorkerAllChan(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean trans) { + this.input = input; + this.output = im2ColOutBlock; + this.HW = params.H * params.W; + this.RS = params.R * params.S; + this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; + this.stride_h = params.stride_h; this.stride_w = params.stride_w; + this.pad_h = params.pad_h; this.pad_w = params.pad_w; + this.trans = trans; + this.simple = params.isStride1Pad0() && W == S && Q == 1; + if(!input.isInSparseFormat()) + throw new RuntimeException("Incorrect operator selection. Expected dense input for SparseIm2colWorkerAllChannels"); + } + + @Override + public void execute(int n) { + output.reset(); + SparseBlock sblock = input.sparseBlock; + if( sblock.isEmpty(n) ) + return; + int apos = sblock.pos(n); + int alen = sblock.size(n); + int[] aix = sblock.indexes(n); + double[] avals = sblock.values(n); + + // Iterate over the sparse block + for(int j=apos; j<apos+alen; j++) { + // Note: the input is of shape [N, CHW] + int chw = aix[j]; + + // Get individual zero-based c,h,w indexes from zero-based 'chw' + int cInput = chw / HW; + int hInput = (chw - cInput*HW)/W; + int wInput = chw % W; + + if( simple ) + appendInputValueToIm2colOutputSimple(output, cInput, hInput, wInput, + avals[j], R, S, RS, P, trans); + else + appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], + R, S, P, Q, stride_h, stride_w, pad_h, pad_w, trans); + } + + output.sortSparseRows(); + } + } + + /** + * Appends the value corresponding to the given [, cInput, hInput, wInput] to the appropriate im2col location of the output + * + * @param output output matrix block + * @param cInput input channel index (zero-based) + * @param hInput input height index (zero-based) + * @param wInput input width index (zero-based) + * @param value input value + * @param R filter height + * @param S filter width + * @param P output height + * @param Q output width + * @param stride_h stride height + * @param stride_w stride width + * @param pad_h pad height + * @param pad_w pad width + */ + private static void appendInputValueToIm2colOutput(MatrixBlock output, int cInput, int hInput, int wInput, double value, + int R, int S, int P, int Q, int stride_h, int stride_w, int pad_h, int pad_w, boolean trans) { + int RS = R*S; + // For the given h,w index, insert avals[j] into respective r,s,p,q locations + + // Constraints: for(int r = 0; r < R; r++) { if(0 <= p && p < P && (hInput - r + pad_h) % stride_h == 0) { ... } } + // Constraint 1: p >= 0 and p = (hInput - r + pad_h) / stride_h + // Therefore, r <= hInput + pad_h + // Constraint 2: p < P and p = (hInput - r + pad_h) / stride_h + // Therefore, hInput + pad_h - P*stride_h < r + // Math.max(0, hInput + pad_h - P*stride_h + 1) <= r <= Math.min(R-1, hInput + pad_h) + int rMin = Math.max(0, hInput + pad_h - P*stride_h + 1); + int rMax = Math.min(R-1, hInput + pad_h); + int sMin = Math.max(0, wInput + pad_w - Q*stride_w + 1); + int sMax = Math.min(S-1, wInput + pad_w); + // Constraint 3: (hInput - r + pad_h) % stride_h == 0 + while((hInput - rMin + pad_h) % stride_h != 0 && rMin <= rMax) rMin++; + while((wInput - sMin + pad_w) % stride_w != 0 && sMin <= sMax) sMin++; + + for(int r = rMin; r <= rMax; r += stride_h) { + // Only append value if h == hInput, where h = (r - pad_h) + p*stride_h and 0 <= p < P + // Therefore, p = (hInput - r + pad_h) / stride_h. Use the same logic for q. + final int p = (hInput - r + pad_h) / stride_h; + final int pQ = p*Q; + final int outRowIndex = cInput*RS + r*S; + for(int s = sMin; s <= sMax; s += stride_w) { + int q = (wInput - s + pad_w) / stride_w; + // chw -> [crs, pq] + if( trans ) + output.appendValue(pQ + q, outRowIndex + s, value); + else + output.appendValue(outRowIndex + s, pQ + q, value); + } + } + } + + private static void appendInputValueToIm2colOutputSimple(MatrixBlock output, int c, int h, int w, + double value, int R, int S, int RS, int P, boolean trans) { + int rMin = Math.max(0, h - P + 1); + int rMax = Math.min(R-1, h); + int cix = c*RS+w+rMin*S; + for(int p=h-rMin; p >= h-rMax; p--, cix+=S) + output.appendValue(trans?p:cix, trans?cix:p, value); + } + + + // ------------------------------------------------------------------------------------------------------ + // Since col2im always operates on intermediate generated as part of matmult, it is not clear which operator to select apriori. + // Therefore, it is provided as utility function rather than an operator (like im2col or rotate180) + + //Converts input: PQ X CRS matrix and writes to 1 X CHW + static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException { + if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) { + throw new DMLRuntimeException("Incorrect input dimensions"); + } + + double [] outputArray = null; + if (!params.output.isInSparseFormat()) + outputArray = params.output.getDenseBlockValues(); + else { + throw new DMLRuntimeException("Only dense output is implemented"); + } + + if(!input.isInSparseFormat()) { + double [] inputArray = input.getDenseBlockValues(); + doCol2IMDenseInput(0, outputN, inputArray, outputArray, params); + } + else { + if(!input.isEmptyBlock()) { + int outOffset = outputN*params.C*params.H*params.W; + int HW = params.H*params.W; + CellIndex3 ix = new CellIndex3(); + SparseBlock sblock = input.sparseBlock; + for(int i = 0; i < input.getNumRows(); i++) { + if( sblock.isEmpty(i) ) continue; + ix = LibMatrixDNNHelper.computeTensorIndexes(i, params.P, params.Q, ix); + int tmpP = ix.ix2*params.stride_h - params.pad_h; + int tmpQ = ix.ix3*params.stride_w - params.pad_w; + if(ix.ix1 != 0) + throw new DMLRuntimeException("Incorrect tensor indexes: "+ ix + ", " + params.P + " " + params.Q); + int apos = sblock.pos(i); + int alen = sblock.size(i); + int[] aix = sblock.indexes(i); + double[] avals = sblock.values(i); + for(int j = apos; j < apos+alen; j++) { + ix = LibMatrixDNNHelper.computeTensorIndexes(aix[j], params.R, params.S, ix); + int h = tmpP + ix.ix2; + int w = tmpQ + ix.ix3; + if(h >= 0 && h < params.H && w >= 0 && w < params.W) { + int outIndex = outOffset + ix.ix1*HW + h*params.W + w; + outputArray[outIndex] += avals[j]; + } + } + } + } + } + } + + // Converts input: PQ X CRS matrix and writes to 1 X CHW if inputN == 0 + // Or converts input: NPQ X CRS matrix and writes to N X CHW + private static void doCol2IMDenseInput(int inputN, int outputN, double [] inputArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { + final int outputNOffset = outputN*params.C*params.H*params.W; + final int HW = params.H*params.W; + final int inputNPQ = inputN*params.P*params.Q; + final int CRS = params.C*params.R*params.S; + final int RS = params.R*params.S; + for (int p = 0; p < params.P; p++) { + // h = p*params.stride_h + r - params.pad_h + // = r + hOffset + // Based on restrictions: h >= 0 and r >= 0 and h < params.H and r < params.R, we get + // max(0, - hOffset) <= r < min(params.R, params.H - hOffset) + final int hOffset = p*params.stride_h - params.pad_h; + final int rStart = Math.max(0, - hOffset); + final int rEnd = Math.min(params.R, params.H - hOffset); + for (int q = 0; q < params.Q; q++) { + // Using the same logic as above on following: + // w = q*params.stride_w + s - params.pad_w + final int wOffset = q*params.stride_w - params.pad_w; + final int sStart = Math.max(0, - wOffset); + final int sEnd = Math.min(params.S, params.W - wOffset); + final int tempOffset = (inputNPQ + p*params.Q + q)*CRS; + for (int c = 0; c < params.C; c++) { + final int outOffset = outputNOffset + c*HW; + final int inputOffset = tempOffset + c*RS; + for (int r = rStart; r < rEnd; r++) { + for (int s = sStart; s < sEnd; s++) { + int inputIndex = inputOffset + r*params.S + s; + int outIndex = outOffset + (hOffset + r)*params.W + wOffset + s; + outputArray[outIndex] += inputArray[inputIndex]; + } + } + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java deleted file mode 100644 index bd29848..0000000 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNIm2ColHelper.java +++ /dev/null @@ -1,419 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.sysml.runtime.matrix.data; - -import java.util.Arrays; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -/** - * This class contains the different implementation of im2col operation - */ -public class LibMatrixDNNIm2ColHelper { - private static final Log LOG = LogFactory.getLog(LibMatrixDNNIm2ColHelper.class.getName()); - static interface Im2colWorker { - public void execute(int n); - public void execute(int n, int c); - public static Im2colWorker getWorker(MatrixBlock input, MatrixBlock out, ConvolutionParameters params, boolean allChannels, boolean trans) { - if(!input.isInSparseFormat()) { - boolean stride1Pad0 = params.stride_h == 1 && params.stride_w == 1 - && params.pad_h == 0 && params.pad_w == 0; - // Note: Only dense im2col operators require the im2ColOutBlock to be allocated in the dense format. - out.reset(out.rlen, out.clen, false); - out.allocateDenseBlock(); - if( LOG.isTraceEnabled() ) - LOG.trace("Using DenseIm2colWorkerAllChannels operator to perform " - + "im2col (stride1pad0="+stride1Pad0+", allChannels="+allChannels+")."); - if(allChannels && stride1Pad0 && !trans ) - return new DenseIm2colWorkerStride1Pad0AllChannels(input.getDenseBlockValues(), out.getDenseBlockValues(), params); - else if( allChannels ) - return new DenseIm2colWorkerAllChannels(input.getDenseBlockValues(), out.getDenseBlockValues(), params, trans); - else if( stride1Pad0 ) - return new DenseIm2colWorkerStride1Pad0(input.getDenseBlockValues(), out.getDenseBlockValues(), params); - else - return new DenseIm2colWorker(input.getDenseBlockValues(), out.getDenseBlockValues(), params); - } - else { - if(LOG.isTraceEnabled()) - LOG.trace("Using SparseIm2colWorker operator to perform im2col."); - out.reset(out.rlen, out.clen, true); - out.allocateSparseRowsBlock(); - //preallocate sparse-rows (larger than average sparsity to account for skew) - int estnnz = (int)Math.ceil(4*input.getSparsity()*out.clen); - for(int r = 0; r < out.rlen; r++) - out.getSparseBlock().allocate(r, Math.min(estnnz, out.clen)); - return new SparseSparseIm2colWorkerAllChan(input, out, params, trans); - } - } - } - - /** - * Special case operator for performing dense im2col when stride = [1, 1] and pad = [0, 0] by using System.arraycopy - */ - static class DenseIm2colWorkerStride1Pad0 implements Im2colWorker { - private final double [] inputArray, outputArray; - private final int S, R, P, Q, CHW, H, W; - public DenseIm2colWorkerStride1Pad0(double [] inputArray, double [] outputArray, ConvolutionParameters params) { - this.inputArray = inputArray; - this.outputArray = outputArray; - this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; - this.CHW = params.C*params.H*params.W; - } - - @Override - public void execute(int n) { - throw new RuntimeException("Not supported"); - } - - @Override - public void execute(int n, int cInput) { - int nOffset = n * CHW; - int RS = R*S; - for (int rs = 0; rs < RS; ++rs) { - int wOffset = rs % S; - int hOffset = rs / S; - for (int h = 0; h < P; ++h) { - int hPadded = h + hOffset; - int outOffset = (rs * P + h) * Q; - int inputOffset = nOffset + (cInput * H + hPadded) * W; - System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, Q); - int w = Q - 1; - int wPadded = w + wOffset; - boolean assign = (hPadded < H && wPadded < W); - outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0; - } - } - } - } - - - - /** - * Special case operator for performing dense im2col when stride = [1, 1] and pad = [0, 0] by using System.arraycopy - */ - static class DenseIm2colWorkerStride1Pad0AllChannels implements Im2colWorker { - private final double [] inputArray, outputArray; - private final int CRS, S, R, P, Q, CHW, H, W; - public DenseIm2colWorkerStride1Pad0AllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params) { - this.inputArray = inputArray; - this.outputArray = outputArray; - this.CRS = params.C * params.R * params.S; - this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; - this.CHW = params.C*params.H*params.W; - } - - @Override - public void execute(int n, int c) { - throw new RuntimeException("Not supported"); - } - - @Override - public void execute(int n) { - int nOffset = n * CHW; - for (int c = 0; c < CRS; ++c) { - int wOffset = c % S; - int hOffset = (c / S) % R; - int cInput = c / R / S; - for (int h = 0; h < P; ++h) { - int hPadded = h + hOffset; - int outOffset = (c * P + h) * Q; - int inputOffset = nOffset + (cInput * H + hPadded) * W; - System.arraycopy(inputArray, inputOffset + wOffset, outputArray, outOffset, Q); - int w = Q - 1; - int wPadded = w + wOffset; - boolean assign = (hPadded < H && wPadded < W); - outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0; - } - } - } - } - - /** - * Performing dense im2col (general case) - */ - static class DenseIm2colWorker implements Im2colWorker { - private final double [] inputArray, outputArray; - private final int S, R, P, Q, CHW, H, W; - private final int stride_h, stride_w, pad_h, pad_w; - public DenseIm2colWorker(double [] inputArray, double [] outputArray, ConvolutionParameters params) { - this.inputArray = inputArray; - this.outputArray = outputArray; - this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; - this.CHW = params.C*params.H*params.W; - this.stride_h = params.stride_h; this.stride_w = params.stride_w; - this.pad_h = params.pad_h; this.pad_w = params.pad_w; - } - - @Override - public void execute(int n) { - throw new RuntimeException("Not supported"); - } - - @Override - public void execute(int n, int cInput) { - int nOffset = n * CHW; int RS = R*S; - for (int rs = 0; rs < RS; ++rs) { - int wOffset = rs % S; - int hOffset = rs / S; - for (int h = 0; h < P; ++h) { - int outOffset = (rs * P + h) * Q; - int hPadded = h * stride_h - pad_h + hOffset; - int inputOffset = nOffset + (cInput * H + hPadded) * W; - if (hPadded < 0 || hPadded >= H) { - Arrays.fill(outputArray, outOffset, outOffset+Q, 0); - } else { - for (int w = 0; w < Q; ++w) { - int wPadded = w * stride_w - pad_w + wOffset; - boolean assign = (wPadded >= 0 && wPadded < W); - outputArray[outOffset + w] = assign ? inputArray[inputOffset + wPadded] : 0; - } - } - } - } - } - } - - /** - * Performing dense im2col (general case) - */ - private static class DenseIm2colWorkerAllChannels implements Im2colWorker { - private final double[] inputArray, outputArray; - private final int CRS, S, R, P, Q, CHW, H, W; - private final int stride_h, stride_w, pad_h, pad_w; - private final boolean trans; - public DenseIm2colWorkerAllChannels(double [] inputArray, double [] outputArray, ConvolutionParameters params, boolean trans) { - this.inputArray = inputArray; - this.outputArray = outputArray; - this.CRS = params.C * params.R * params.S; - this.H = params.H; this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; - this.CHW = params.C*params.H*params.W; - this.stride_h = params.stride_h; this.stride_w = params.stride_w; - this.pad_h = params.pad_h; this.pad_w = params.pad_w; - this.trans = trans; - } - - @Override - public void execute(int n, int c) { - throw new RuntimeException("Not supported"); - } - - @Override - public void execute(int n) { - //reset for selective copy - Arrays.fill(outputArray, 0); - - int nOffset = n * CHW; - for (int c = 0; c < CRS; ++c) { - int wOffset = c % S; - int hOffset = (c / S) % R; - int cInput = c / R / S; - for (int h = 0; h < P; ++h) { - int outOffset = trans ? c+(h*Q*CRS) : (c*P+h)*Q; - int hPadded = h * stride_h - pad_h + hOffset; - int inputOffset = nOffset + (cInput * H + hPadded) * W; - if (hPadded < 0 || hPadded >= H ) continue; - for (int w = 0; w < Q; ++w) { - int wPadded = w * stride_w - pad_w + wOffset; - if( wPadded >= 0 && wPadded < W ) - outputArray[outOffset + (trans?w*CRS:w)] - = inputArray[inputOffset + wPadded]; - } - } - } - } - } - - /** - * Performing sparse im2col for all channels for a given row n of the input matrix. - */ - private static class SparseSparseIm2colWorkerAllChan implements Im2colWorker { - private final MatrixBlock input, output; - private final int S, R, P, Q, W, HW, RS; - private final int stride_h, stride_w, pad_h, pad_w; - private final boolean trans; - private final boolean simple; - public SparseSparseIm2colWorkerAllChan(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean trans) { - this.input = input; - this.output = im2ColOutBlock; - this.HW = params.H * params.W; - this.RS = params.R * params.S; - this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; - this.stride_h = params.stride_h; this.stride_w = params.stride_w; - this.pad_h = params.pad_h; this.pad_w = params.pad_w; - this.trans = trans; - this.simple = params.isStride1Pad0() && W == S && Q == 1; - if(!input.isInSparseFormat()) - throw new RuntimeException("Incorrect operator selection. Expected dense input for SparseIm2colWorkerAllChannels"); - } - - @Override - public void execute(int n, int c) { - throw new RuntimeException("Not supported"); - } - - @Override - public void execute(int n) { - output.reset(); - SparseBlock sblock = input.sparseBlock; - if( sblock.isEmpty(n) ) - return; - int apos = sblock.pos(n); - int alen = sblock.size(n); - int[] aix = sblock.indexes(n); - double[] avals = sblock.values(n); - - // Iterate over the sparse block - for(int j=apos; j<apos+alen; j++) { - // Note: the input is of shape [N, CHW] - int chw = aix[j]; - - // Get individual zero-based c,h,w indexes from zero-based 'chw' - int cInput = chw / HW; - int hInput = (chw - cInput*HW)/W; - int wInput = chw % W; - - if( simple ) - appendInputValueToIm2colOutputSimple(output, cInput, hInput, wInput, - avals[j], R, S, RS, P, trans); - else - appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], - R, S, P, Q, stride_h, stride_w, pad_h, pad_w, trans); - } - - output.sortSparseRows(); - } - } - - /** - * Performing sparse im2col for a given channel c and for a given row n of the input matrix. - */ - @SuppressWarnings("unused") - private static class SparseSparseIm2colWorker implements Im2colWorker { - private final MatrixBlock input, output; - private final int S, R, P, Q, W, HW; - private final int stride_h, stride_w, pad_h, pad_w; - private final boolean trans; - - public SparseSparseIm2colWorker(MatrixBlock input, MatrixBlock im2ColOutBlock, ConvolutionParameters params, boolean trans) { - this.input = input; - this.output = im2ColOutBlock; - this.HW = params.H*params.W; - this.W = params.W; this.R = params.R; this.S = params.S; this.P = params.P; this.Q = params.Q; - this.stride_h = params.stride_h; this.stride_w = params.stride_w; - this.pad_h = params.pad_h; this.pad_w = params.pad_w; - this.trans = trans; - } - - @Override - public void execute(int n) { - throw new RuntimeException("Not supported"); - } - - @Override - public void execute(int n, int cInput) { - output.reset(); - SparseBlock sblock = input.sparseBlock; - if( sblock.isEmpty(n) ) - return; - int apos = sblock.pos(n); - int alen = sblock.size(n); - int[] aix = sblock.indexes(n); - double[] avals = sblock.values(n); - - // Iterate over the sparse block - for(int j=apos; j<apos+alen; j++) { - // Note: the input is of shape [N, CHW] - int chw = aix[j]; - - if(cInput == (chw / HW)) { - // Get individual zero-based c,h,w indexes from zero-based 'chw' - int hInput = (chw - cInput*HW)/W; - int wInput = chw % W; - - appendInputValueToIm2colOutput(output, cInput, hInput, wInput, avals[j], - R, S, P, Q, stride_h, stride_w, pad_h, pad_w, trans); - } - } - output.sortSparseRows(); - } - } - - /** - * Appends the value corresponding to the given [, cInput, hInput, wInput] to the appropriate im2col location of the output - * - * @param output output matrix block - * @param cInput input channel index (zero-based) - * @param hInput input height index (zero-based) - * @param wInput input width index (zero-based) - * @param value input value - * @param R filter height - * @param S filter width - * @param P output height - * @param Q output width - * @param stride_h stride height - * @param stride_w stride width - * @param pad_h pad height - * @param pad_w pad width - */ - private static void appendInputValueToIm2colOutput(MatrixBlock output, int cInput, int hInput, int wInput, double value, - int R, int S, int P, int Q, int stride_h, int stride_w, int pad_h, int pad_w, boolean trans) { - int RS = R*S; - // For the given h,w index, insert avals[j] into respective r,s,p,q locations - - // Constraints: for(int r = 0; r < R; r++) { if(0 <= p && p < P && (hInput - r + pad_h) % stride_h == 0) { ... } } - // Constraint 1: p >= 0 and p = (hInput - r + pad_h) / stride_h - // Therefore, r <= hInput + pad_h - // Constraint 2: p < P and p = (hInput - r + pad_h) / stride_h - // Therefore, hInput + pad_h - P*stride_h < r - // Math.max(0, hInput + pad_h - P*stride_h + 1) <= r <= Math.min(R-1, hInput + pad_h) - int rMin = Math.max(0, hInput + pad_h - P*stride_h + 1); - int rMax = Math.min(R-1, hInput + pad_h); - int sMin = Math.max(0, wInput + pad_w - Q*stride_w + 1); - int sMax = Math.min(S-1, wInput + pad_w); - // Constraint 3: (hInput - r + pad_h) % stride_h == 0 - while((hInput - rMin + pad_h) % stride_h != 0 && rMin <= rMax) rMin++; - while((wInput - sMin + pad_w) % stride_w != 0 && sMin <= sMax) sMin++; - - for(int r = rMin; r <= rMax; r += stride_h) { - // Only append value if h == hInput, where h = (r - pad_h) + p*stride_h and 0 <= p < P - // Therefore, p = (hInput - r + pad_h) / stride_h. Use the same logic for q. - final int p = (hInput - r + pad_h) / stride_h; - final int pQ = p*Q; - final int outRowIndex = cInput*RS + r*S; - for(int s = sMin; s <= sMax; s += stride_w) { - int q = (wInput - s + pad_w) / stride_w; - // chw -> [crs, pq] - if( trans ) - output.appendValue(pQ + q, outRowIndex + s, value); - else - output.appendValue(outRowIndex + s, pQ + q, value); - } - } - } - - private static void appendInputValueToIm2colOutputSimple(MatrixBlock output, int c, int h, int w, - double value, int R, int S, int RS, int P, boolean trans) { - int rMin = Math.max(0, h - P + 1); - int rMax = Math.min(R-1, h); - int cix = c*RS+w+rMin*S; - for(int p=h-rMin; p >= h-rMax; p--, cix+=S) - output.appendValue(trans?p:cix, trans?cix:p, value); - } -} http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPooling.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPooling.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPooling.java new file mode 100644 index 0000000..067e9ef --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNPooling.java @@ -0,0 +1,532 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.concurrent.Callable; + +import org.apache.sysml.hops.OptimizerUtils; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.matrix.data.LibMatrixDNNHelper.CellIndex3; + +/** + * This class contains the set of operators used for performing pooling + */ +public class LibMatrixDNNPooling { + + // *********************************** low-level runtime operator selection *********************************************** + // *********************************** based on runtime properties (sparsity, native, etc) ******************************** + // These methods help reduce branch miss predictions and instruction-cache misses. + // Also, they simplify the design of LibMatrixDNN and help in code-maintenance. + + /** + * Factory method that returns list of callable tasks for performing maxpooling operation + * + * @param params convolution parameters + * @return list of callable tasks for performing maxpooling operation + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getMaxPoolingWorkers(ConvolutionParameters params) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<>(); + int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k)); + for(int i = 0; i*taskSize < params.N; i++) { + if(params.input1.isInSparseFormat()) + ret.add(new SparseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else + ret.add(new DenseMaxPooling(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + } + return ret; + } + + /** + * Factory method that returns list of callable tasks for performing maxpooling backward operation + * + * @param params convolution parameters + * @param performReluBackward whether to perform ReLU backward + * @return list of callable tasks for performing maxpooling backward operation + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getMaxPoolingBackwardWorkers(ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<>(); + int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k)); + boolean sparse1 = params.input1.isInSparseFormat(); + boolean sparse2 = params.input2.isInSparseFormat(); + for(int i = 0; i*taskSize < params.N; i++) { + if( !sparse1 && !sparse2 ) + ret.add(new PoolingBackwardDenseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); + else if( !sparse1 && sparse2 ) + ret.add(new PoolingBackwardDenseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); + else if( sparse1 && !sparse2 ) + ret.add(new PoolingBackwardSparseDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); + else if( sparse1 && sparse2 ) + ret.add(new PoolingBackwardSparseSparse(i*taskSize, Math.min((i+1)*taskSize, params.N), params, performReluBackward)); + } + return ret; + } + + private static class DenseMaxPooling implements Callable<Long> + { + private final int _rl, _ru; + private final ConvolutionParameters _params; + + public DenseMaxPooling(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + final int C = _params.C, P = _params.P, Q = _params.Q; + final int R = _params.R, S = _params.S, H = _params.H, W = _params.W; + final int HW = _params.H*_params.W; + final int CHW = _params.C*_params.H*_params.W; + final int CPQ = C*P*Q; + double[] in = _params.input1.getDenseBlockValues(); + double[] out = _params.output.getDenseBlockValues(); + + double minValForMaxPoolOperations = _params.minValForMaxPoolOperations; + + //thread-local initialization of output block + if( !(_params.isStride1Pad0() && _params.isAllOnes(P, Q, W)) ) + Arrays.fill(out, _rl*CPQ, _ru*CPQ, minValForMaxPoolOperations); + + if( _params.isStride1Pad0() && _params.isAllOnes(P, Q, W) ) { + //quick-path w/o materialized index arrays and + //simplified inner loops for P = 1, Q = 1, W = 1 + int lenh = Math.min(R,H); + for(int i = _rl, oix=_rl*C; i < _ru; i++, oix+=C) + for (int c = 0, off=i*CHW; c < C; c++, off+=H) + out[oix+c] = max(minValForMaxPoolOperations, in, off, lenh); + } + else if( _params.isStride1Pad0() ) { + //quick-path w/o materialized index arrays + for(int i = _rl; i < _ru; i++) + for (int c = 0, off=i*CHW, oix=i*CPQ; c < C; c++, off+=HW) + for (int p = 0; p < P; p++, oix+=Q) + for (int h = p; h < Math.min(p+R,H); h++) + for (int q = 0, off2=off+h*W; q < Q; q++) + out[oix+q] = max(out[oix+q], in, off2+q, Math.min(S,W-q)); + } + else { //general case + int[] hl = _params.start_indexes_h, hu = _params.end_indexes_h; + int[] wl = _params.start_indexes_w, wu = _params.end_indexes_w; + for(int i = _rl; i < _ru; i++) + for (int c = 0, off=i*CHW, oix=i*CPQ; c < C; c++, off+=HW) + for (int p = 0; p < P; p++, oix+=Q) + for (int h = hl[p]; h < hu[p]; h++) + for (int q = 0, off2=off+h*W; q < Q; q++) + out[oix+q] = max(out[oix+q], in, off2+wl[q], wu[q]-wl[q]); + } + + //thread-local recomputation of non-zeros + return _params.output.recomputeNonZeros(_rl, _ru-1); + } + } + + private static class SparseMaxPooling implements Callable<Long> + { + private final int _rl, _ru; + private final ConvolutionParameters _params; + private double [] outputArray; + private final int C, P, Q, W, H, CPQ, PQ; + + public SparseMaxPooling(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + outputArray = params.output.getDenseBlockValues(); + C = params.C; P = params.P; Q = params.Q; H = params.H; + W = params.W; + CPQ = C*P*Q; + PQ = P*Q; + } + + @Override + public Long call() throws Exception { + //thread-local initialization of output block + Arrays.fill(outputArray, _rl *CPQ, _ru*CPQ, _params.minValForMaxPoolOperations); + + for(int n = _rl; n < _ru; n++) { + if( !_params.input1.sparseBlock.isEmpty(n) ) { + final int apos = _params.input1.sparseBlock.pos(n); + final int alen = _params.input1.sparseBlock.size(n); + final int [] aix = _params.input1.sparseBlock.indexes(n); + final double [] avals = _params.input1.sparseBlock.values(n); + int chw = 0; int index = apos; + for (int c = 0; c < C; c++) { + final int outOffset = n*CPQ + c*PQ; + for(int h = 0; h < H; h++) { + for(int w = 0; w < W; w++, chw++) { + // Take into account zero values as well + double nchwVal = 0; + if(aix[index] == chw) { + nchwVal = avals[index++]; + // Ensure that we satisfy the condition index < apos+alen + if(index >= apos+alen) index--; + } + // Perform maxpooling without binary search :) + // Tradeoff as compared to dense maxpooling: + // In dense maxpooling, iteration space CPQHW where H and W iterations are restricted by _params.start_indexes_h[p] + // and are eligible for JIT optimizations. + // In sparse maxpooling, iteration space CHWPQ without HW restrictions. + for (int p = 0; p < P; p++) { + if(h >= _params.start_indexes_h[p] && h < _params.end_indexes_h[p]) { + final int outOffsetWithp = outOffset + p*Q; + for (int q = 0; q < Q; q++) { + if(w >= _params.start_indexes_w[q] && w < _params.end_indexes_w[q]) { + outputArray[outOffsetWithp + q] = Math.max(outputArray[outOffsetWithp + q], nchwVal); + } + } + } + } + } + } + } + } + else { + // Empty input image + Arrays.fill(outputArray, n*CPQ, (n+1)*CPQ, 0); + } + } + + //thread-local recomputation of non-zeros + return _params.output.recomputeNonZeros(_rl, _ru-1); + } + } + + //BACKWARD + + /** + * Performs the maxpooling backward operation for dense input and dense error (dout) + */ + private static class PoolingBackwardDenseDense implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + boolean performReluBackward; + double [] inputArray, doutArray; + MatrixBlock output; + int C; int CHW; int P; int Q; int HW; int CPQ; int PQ; + public PoolingBackwardDenseDense(int rl, int ru, ConvolutionParameters params, boolean performReluBackward) { + _rl = rl; _ru = ru; + _params = params; + this.performReluBackward = performReluBackward; + inputArray = params.input1.getDenseBlockValues(); + doutArray = params.input2.getDenseBlockValues(); + output = params.output; + C = params.C; CHW = params.C*params.H*params.W; HW = params.H*params.W; + P = params.P; Q = params.Q; CPQ = params.C*params.P*params.Q; + PQ = params.P*params.Q; + if (inputArray == null || doutArray == null || output.getDenseBlock() == null ) + throw new RuntimeException("Incorrect usage: empty inputs"); + } + + @Override + public Long call() throws Exception { + double[] out = output.getDenseBlockValues(); + for(int n = _rl; n < _ru; n++) { + for (int c = 0; c < C; c++) { + final int inputOffset = n*CHW + c*HW; + final int outputOffset = n*CPQ + c*PQ; + for (int p = 0; p < P; p++) { + for (int q = 0; q < Q; q++) { + int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, _params, performReluBackward); + if(maxIndex != -1) + out[maxIndex] += doutArray[outputOffset + p * Q + q]; + } + } + } + } + //thread-local nnz maintenance + return output.recomputeNonZeros(_rl, _ru-1); + } + } + + /** + * Performs the maxpooling backward operation for dense input and sparse error (dout) + */ + private static class PoolingBackwardDenseSparse implements Callable<Long> + { + public int _rl; public int _ru; + private final ConvolutionParameters _params; + MatrixBlock output; + boolean performReluBackward; + double [] inputArray; MatrixBlock dout; + int CHW; int P; int Q; int HW; + public PoolingBackwardDenseSparse(int rl, int ru, ConvolutionParameters params, boolean performReluBackward) { + _rl = rl; _ru = ru; + _params = params; + this.performReluBackward = performReluBackward; + inputArray = params.input1.getDenseBlockValues(); + dout = params.input2; + output = params.output; + CHW = params.C*params.H*params.W; HW = params.H*params.W; + P = params.P; Q = params.Q; + if (inputArray == null || output.getDenseBlock() == null ) + throw new RuntimeException("Incorrect usage: empty inputs"); + if (!params.input2.isInSparseFormat()) + throw new RuntimeException("Incorrect usage: Call optimized versions"); + } + + @Override + public Long call() throws Exception { + CellIndex3 ix = new CellIndex3(); + double[] out = output.getDenseBlockValues(); + SparseBlock sblock = dout.sparseBlock; + for(int n = _rl; n < _ru; n++) { + if( sblock.isEmpty(n) ) continue; + int apos = sblock.pos(n); + int alen = sblock.size(n); + int[] aix = sblock.indexes(n); + double[] avals = sblock.values(n); + for(int j = apos; j < apos+alen; j++) { + ix = LibMatrixDNNHelper.computeTensorIndexes(aix[j], P, Q, ix); + final int inputOffset = n*CHW + ix.ix1*HW; + int maxIndex = getMaxIndex(ix.ix2, ix.ix3, + inputOffset, inputArray, _params, performReluBackward); + if(maxIndex != -1) + out[maxIndex] += avals[j]; + } + } + //thread-local nnz maintenance + return output.recomputeNonZeros(_rl, _ru-1); + } + } + + /** + * Performs the maxpooling backward operation for sparse input and dense error (dout) + */ + private static class PoolingBackwardSparseDense implements Callable<Long> + { + private final int _rl, _ru; + private final ConvolutionParameters _params; + private final boolean reluBack; + protected final MatrixBlock doutput, output; + + protected PoolingBackwardSparseDense(int rl, int ru, ConvolutionParameters params, boolean relu, MatrixBlock dout, MatrixBlock out) { + _rl = rl; _ru = ru; + _params = params; + reluBack = relu; + doutput = dout; + output = out; + } + + public PoolingBackwardSparseDense(int rl, int ru, ConvolutionParameters params, boolean relu) { + this(rl, ru, params, relu, params.input2, params.output); + if (doutput.getDenseBlock() == null || output.getDenseBlock() == null ) + throw new RuntimeException("Incorrect usage: empty inputs"); + if (!params.input1.isInSparseFormat()) + throw new RuntimeException("Incorrect usage: sparse input1 expected"); + } + + @Override + public Long call() throws Exception + { + final int P = _params.P, Q = _params.Q, W = _params.W; + final int C = _params.C, R = _params.R, S = _params.S; + final int padh = _params.pad_h, padw = _params.pad_w; + final int strideh = _params.stride_h, stridew = _params.stride_w; + final int PQ = _params.P * _params.Q; + final int CPQ = _params.C * _params.P * _params.Q; + final int HW = _params.H * _params.W; + final int CHW = _params.C * _params.H * _params.W; + + //allocate auxiliary data structures + double[] maxVal = new double[PQ]; + int[] maxIx = new int[PQ]; + + for(int n = _rl; n < _ru; n++) { + for (int c = 0; c < C; c++) { + //step 0: basic initializations + final int outOffset = n*CHW + c*HW; + + //step 1: perform maxpooling w/ index maintenance in a + //single, sequential pass over the sparse input matrix + maxpoolingForward(maxVal, maxIx, n, c, + padh, padw, strideh, stridew, C, P, Q, R, S, HW, W); + + //step 2: perform maxpooling backward + maxpoolingBackward(maxIx, outOffset, n, c, C, Q, PQ, CPQ); + } + } + //thread-local nnz maintenance + return output.recomputeNonZeros(_rl, _ru-1); + } + + protected void maxpoolingForward(double[] maxVal, int[] maxIx, int n, int c, int padh, int padw, int strideh, int stridew, int C, int P, int Q, int R, int S, int HW, int W) { + SparseBlock sblock = _params.input1.getSparseBlock(); + if( !sblock.isEmpty(n) ) { + Arrays.fill(maxVal, -Double.MAX_VALUE); + int apos = sblock.pos(n); + int alen = sblock.size(n); + int[] aix = sblock.indexes(n); + double[] avals = sblock.values(n); + //find channel start and end, w/ robustness for non-existing entries + int cpos = (c==0) ? 0 : sblock.posFIndexGTE(n, c*HW); + int cpos2 = (c+1==C) ? alen : sblock.posFIndexGTE(n, (c+1)*HW); + cpos = (cpos>=0) ? cpos : alen; + cpos2 = (cpos2>=0) ? cpos2 : alen; + int lastix = c*HW-1; + for(int j=apos+cpos; j<apos+cpos2; j++) { + //handle skipped zero values + update0(lastix+1, aix[j], maxVal, maxIx, padh, padw, strideh, stridew, P, Q, R, S, HW, W); + //handle current non-zero value + int h = (aix[j] % HW) / W; + int w = aix[j] % W; + double val = reluBack && avals[j] < 0 ? 0 : avals[j]; + update(val, maxVal, maxIx, h, w, padh, padw, strideh, stridew, P, Q, R, S, W); + //memoize last seen index + lastix = aix[j]; + } + //handle skipped zero values at end of row + update0(lastix+1, (c+1)*HW, maxVal, maxIx, padh, padw, strideh, stridew, P, Q, R, S, HW, W); + } + else { + //handle empty row + Arrays.fill(maxVal, 0); + for(int p = 0, ix=0; p < P; p++) { + int h = Math.max(-padh+p*strideh, 0); + for(int q = 0; q < Q; q++, ix++) { + int w = Math.max(-padw+q*stridew, 0); + maxIx[ix] = h * W + w; + } + } + } + } + + protected void maxpoolingBackward(int[] maxIx, int outOffset, int n, int c, int C, int Q, int PQ, int CPQ) { + double[] dout = doutput.getDenseBlockValues(); + double[] out = output.getDenseBlockValues(); + final int doutOffset = n*CPQ + c*PQ; + for( int pq = 0; pq < PQ; pq++ ) + out[ outOffset + maxIx[pq] ] += dout[ doutOffset + pq ]; + } + + private static void update0(int lix, int uix, double[] maxVal, int[] maxIx, int padh, int padw, int strideh, int stridew, int P, int Q, int R, int S, int HW, int W) { + //TODO exploit constant value and overlap for potential early abort + for(int i = lix; i<uix; i++) + update(0, maxVal, maxIx, (i%HW)/W, i%W, padh, padw, strideh, stridew, P, Q, R, S, W); + } + + private static void update(double val, double[] maxVal, int[] maxIx, int h, int w, int padh, int padw, int strideh, int stridew, int P, int Q, int R, int S, int W) { + //determine lower and upper bounds for p and q + //(see fillIndexesArray, solved for p and q, reversed) + int lp = Math.max((h+padh-R+strideh)/strideh, 0); + int up = Math.min((h+padh+strideh)/strideh, P); + int lq = Math.max((w+padw-S+stridew)/stridew, 0); + int uq = Math.min((w+padw+stridew)/stridew, Q); + + //maintain max index for all relevant p and q + int maxIndex = h * W + w; + for(int p = lp; p < up; p++) + for(int q = lq; q < uq; q++) { + int ix = p * Q + q; + if( maxVal[ix] < val ) { + maxVal[ix] = val; + maxIx[ix] = maxIndex; + } + } + } + } + + /** + * Performs the maxpooling backward operation for sparse input and sparse error (dout) + */ + private static class PoolingBackwardSparseSparse extends PoolingBackwardSparseDense + { + public PoolingBackwardSparseSparse(int rl, int ru, ConvolutionParameters params, boolean relu) { + super(rl, ru, params, relu, params.input2, params.output); + if (output.getDenseBlock() == null ) + throw new RuntimeException("Incorrect usage: empty outputs"); + if (!params.input1.isInSparseFormat() || !params.input2.isInSparseFormat()) + throw new RuntimeException("Incorrect usage: Call optimized versions"); + } + + @Override + protected void maxpoolingBackward(int[] maxIx, int outOffset, int n, int c, int C, int Q, int PQ, int CPQ) { + SparseBlock sblock = doutput.getSparseBlock(); + double[] out = output.getDenseBlockValues(); + if( sblock.isEmpty(n) ) + return; + int apos = sblock.pos(n); + int alen = sblock.size(n); + int[] aix = sblock.indexes(n); + double[] avals = sblock.values(n); + //find channel start and end, w/ robustness for non-existing entries + int cpos = (c==0) ? 0 : sblock.posFIndexGTE(n, c*PQ); + int cpos2 = (c+1==C) ? alen : sblock.posFIndexGTE(n, (c+1)*PQ); + cpos = (cpos>=0) ? cpos : alen; + cpos2 = (cpos2>=0) ? cpos2 : alen; + for(int j = apos+cpos; j<apos+cpos2; j++) { + int p = (aix[j] % PQ) / Q; + int q = aix[j] % Q; + int pq = p * Q + q; + out[ outOffset + maxIx[pq] ] += avals[j]; + } + } + } + + private static double max(final double aval, double[] b, final int bi, final int len) { + double ret = aval; + for( int i = bi; i < bi+len; i++ ) + ret = Math.max(ret, b[i]); + return ret; + } + + /** + * Returns the index of cell with maximum value. This method is optimized for dense input + * + * @param p output feature map height + * @param q output feature map width + * @param inputOffset offset to be used for input index + * @param inputArray input array + * @param params convolution parameters + * @param performReluBackward perform ReLU backward + * @return index of cell with maximum value + */ + private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params, boolean performReluBackward) { + int start_index_h = params.start_indexes_h[p]; + int end_index_h = params.end_indexes_h[p]; + int start_index_w = params.start_indexes_w[q]; + int end_index_w = params.end_indexes_w[q]; + + int maxIndex = -1; + double maxVal = -Double.MAX_VALUE; + + // Note: We do not treat pad as zero and hence we don't do: + // maxVal = 0 + // if start_index_h < 0 || start_index_w < 0 || end_index_h >= params.H || end_index_w >= params.W + + // Find maxIndex + double currDoutVal = -1; + for (int h = start_index_h; h < end_index_h; h++) { + for (int w = start_index_w; w < end_index_w; w++) { + currDoutVal = inputArray[inputOffset + h*params.W + w]; + currDoutVal = performReluBackward && currDoutVal < 0 ? 0 : currDoutVal; + if(maxVal < currDoutVal) { + maxIndex = inputOffset + h*params.W + w; + maxVal = currDoutVal; + } + } + } + return maxIndex; + } +}
