http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/hops/IndexingOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/IndexingOp.java b/src/main/java/org/apache/sysml/hops/IndexingOp.java index 5f2ce34..6a3ddf4 100644 --- a/src/main/java/org/apache/sysml/hops/IndexingOp.java +++ b/src/main/java/org/apache/sysml/hops/IndexingOp.java @@ -19,6 +19,7 @@ package org.apache.sysml.hops; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.hops.AggBinaryOp.SparkAggType; import org.apache.sysml.hops.rewrite.HopRewriteUtils; import org.apache.sysml.lops.Aggregate; @@ -97,7 +98,13 @@ public class IndexingOp extends Hop @Override public boolean isGPUEnabled() { - return false; + if(!DMLScript.USE_ACCELERATOR) { + return false; + } + else { + // only matrix indexing is supported on GPU + return (getDataType() == DataType.MATRIX); + } } @Override @@ -172,7 +179,7 @@ public class IndexingOp extends Hop setLineNumbers(reindex); setLops(reindex); } - else //CP + else //CP or GPU { Lop dummy = Data.createLiteralLop(ValueType.INT, Integer.toString(-1)); RangeBasedReIndex reindex = new RangeBasedReIndex(
http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index 36f57b4..8abfc69 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -27,6 +27,7 @@ import org.apache.sysml.runtime.instructions.gpu.BuiltinBinaryGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.BuiltinUnaryGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.ConvolutionGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; +import org.apache.sysml.runtime.instructions.gpu.MatrixIndexingGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.MatrixMatrixAxpyGPUInstruction; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE; import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction; @@ -128,6 +129,10 @@ public class GPUInstructionParser extends InstructionParser String2GPUInstructionType.put( ">" , GPUINSTRUCTION_TYPE.RelationalBinary); String2GPUInstructionType.put( "<=" , GPUINSTRUCTION_TYPE.RelationalBinary); String2GPUInstructionType.put( ">=" , GPUINSTRUCTION_TYPE.RelationalBinary); + + // Indexing + // right indexing: output = X[1:3, 4:5] + String2GPUInstructionType.put( "rangeReIndex", GPUINSTRUCTION_TYPE.MatrixIndexing); } public static GPUInstruction parseSingleInstruction (String str ) @@ -187,6 +192,9 @@ public class GPUInstructionParser extends InstructionParser case RelationalBinary: return RelationalBinaryGPUInstruction.parseInstruction(str); + case MatrixIndexing: + return MatrixIndexingGPUInstruction.parseInstruction(str); + default: throw new DMLRuntimeException("Invalid GPU Instruction Type: " + gputype ); } http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index b962eb7..0f0b28e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -43,7 +43,8 @@ public abstract class GPUInstruction extends Instruction ArithmeticBinary, BuiltinUnary, BuiltinBinary, - Builtin + Builtin, + MatrixIndexing }; // Memory/conversions @@ -112,6 +113,9 @@ public abstract class GPUInstruction extends Instruction public final static String MISC_TIMER_REDUCE_ALL_KERNEL = "rallk"; // time spent in reduce all kernel public final static String MISC_TIMER_REDUCE_ROW_KERNEL = "rrowk"; // time spent in reduce row kernel public final static String MISC_TIMER_REDUCE_COL_KERNEL = "rcolk"; // time spent in reduce column kernel + + public final static String MISC_TIMER_RIX_DENSE_OP = "drix"; // time spent in the right indexing dense kernel + public final static String MISC_TIMER_RIX_SPARSE_DENSE_OP = "sdrix"; // time spent in the right indexing sparse dense kernel // Deep learning operators public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB = "nnaf"; // time spent in cudnnActivationForward http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java new file mode 100644 index 0000000..5e2c8fc --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.instructions.gpu; + +import org.apache.sysml.parser.Expression.DataType; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.instructions.InstructionUtils; +import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.operators.Operator; +import org.apache.sysml.runtime.matrix.operators.SimpleOperator; +import org.apache.sysml.runtime.util.IndexRange; +import org.apache.sysml.utils.GPUStatistics; + +public class MatrixIndexingGPUInstruction extends GPUInstruction { + CPOperand rowLower, rowUpper, colLower, colUpper; + CPOperand input1; CPOperand input2; CPOperand output; + + public MatrixIndexingGPUInstruction(Operator op, CPOperand in, + CPOperand rl, CPOperand ru, CPOperand cl, CPOperand cu, CPOperand out, String opcode, String istr){ + super(op, opcode, istr); + _gputype = GPUINSTRUCTION_TYPE.MatrixIndexing; + rowLower = rl; + rowUpper = ru; + colLower = cl; + colUpper = cu; + input1 = in; + output = out; + } + + public MatrixIndexingGPUInstruction(Operator op, CPOperand lhsInput, CPOperand rhsInput, + CPOperand rl, CPOperand ru, CPOperand cl, CPOperand cu, CPOperand out, String opcode, String istr){ + super(op, opcode, istr); + _gputype = GPUINSTRUCTION_TYPE.MatrixIndexing; + rowLower = rl; + rowUpper = ru; + colLower = cl; + colUpper = cu; + input1 = lhsInput; + input2 = rhsInput; + output = out; + } + + public static MatrixIndexingGPUInstruction parseInstruction ( String str ) throws DMLRuntimeException { + String[] parts = InstructionUtils.getInstructionPartsWithValueType(str); + String opcode = parts[0]; + + if ( opcode.equalsIgnoreCase("rangeReIndex") ) { + if ( parts.length == 7 ) { + // Example: rangeReIndex:mVar1:Var2:Var3:Var4:Var5:mVar6 + CPOperand in, rl, ru, cl, cu, out; + in = new CPOperand(); + rl = new CPOperand(); + ru = new CPOperand(); + cl = new CPOperand(); + cu = new CPOperand(); + out = new CPOperand(); + in.split(parts[1]); + rl.split(parts[2]); + ru.split(parts[3]); + cl.split(parts[4]); + cu.split(parts[5]); + out.split(parts[6]); + if( in.getDataType()==DataType.MATRIX ) + return new MatrixIndexingGPUInstruction(new SimpleOperator(null), in, rl, ru, cl, cu, out, opcode, str); + else + throw new DMLRuntimeException("Can index only on Matrices in GPU"); + } + else { + throw new DMLRuntimeException("Invalid number of operands in instruction: " + str); + } + } + else if ( opcode.equalsIgnoreCase("leftIndex")) { + if ( parts.length == 8 ) { + // Example: leftIndex:mVar1:mvar2:Var3:Var4:Var5:Var6:mVar7 + CPOperand lhsInput, rhsInput, rl, ru, cl, cu, out; + lhsInput = new CPOperand(); + rhsInput = new CPOperand(); + rl = new CPOperand(); + ru = new CPOperand(); + cl = new CPOperand(); + cu = new CPOperand(); + out = new CPOperand(); + lhsInput.split(parts[1]); + rhsInput.split(parts[2]); + rl.split(parts[3]); + ru.split(parts[4]); + cl.split(parts[5]); + cu.split(parts[6]); + out.split(parts[7]); + if( lhsInput.getDataType()==DataType.MATRIX ) + return new MatrixIndexingGPUInstruction(new SimpleOperator(null), lhsInput, rhsInput, rl, ru, cl, cu, out, opcode, str); + else + throw new DMLRuntimeException("Can index only on Matrices in GPU"); + } + else { + throw new DMLRuntimeException("Invalid number of operands in instruction: " + str); + } + } + else { + throw new DMLRuntimeException("Unknown opcode while parsing a MatrixIndexingGPUInstruction: " + str); + } + } + + @Override + public void processInstruction(ExecutionContext ec) + throws DMLRuntimeException { + GPUStatistics.incrementNoOfExecutedGPUInst(); + String opcode = getOpcode(); + + IndexRange ixrange = getIndexRange(ec); + if ( opcode.equalsIgnoreCase("rangeReIndex") ) { + MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, input1.getName()); + LibMatrixCUDA.sliceOperations(ec, ec.getGPUContext(0), getExtendedOpcode(), mat1, ixrange, output.getName()); + ec.releaseMatrixInputForGPUInstruction(input1.getName()); + ec.releaseMatrixOutputForGPUInstruction(output.getName()); + } + else { + throw new DMLRuntimeException("Unsupported GPU operator:" + opcode); + } + } + + IndexRange getIndexRange(ExecutionContext ec) throws DMLRuntimeException { + return new IndexRange( //rl, ru, cl, ru + (int)(ec.getScalarInput(rowLower.getName(), rowLower.getValueType(), rowLower.isLiteral()).getLongValue()-1), + (int)(ec.getScalarInput(rowUpper.getName(), rowUpper.getValueType(), rowUpper.isLiteral()).getLongValue()-1), + (int)(ec.getScalarInput(colLower.getName(), colLower.getValueType(), colLower.isLiteral()).getLongValue()-1), + (int)(ec.getScalarInput(colUpper.getName(), colUpper.getValueType(), colUpper.isLiteral()).getLongValue()-1)); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java index 7244938..a5bc299 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -49,6 +49,18 @@ import jcuda.jcusparse.cusparsePointerMode; /** * Compressed Sparse Row (CSR) format for CUDA * Generalized matrix multiply is implemented for CSR format in the cuSparse library among other operations + * + * Since we assume that the matrix is stored with zero-based indexing (i.e. CUSPARSE_INDEX_BASE_ZERO), + * the matrix + * 1.0 4.0 0.0 0.0 0.0 + * 0.0 2.0 3.0 0.0 0.0 + * 5.0 0.0 0.0 7.0 8.0 + * 0.0 0.0 9.0 0.0 6.0 + * + * is stored as + * val = 1.0 4.0 2.0 3.0 5.0 7.0 8.0 9.0 6.0 + * rowPtr = 0.0 2.0 4.0 7.0 9.0 + * colInd = 0.0 1.0 1.0 2.0 0.0 3.0 4.0 2.0 4.0 */ public class CSRPointer { @@ -184,9 +196,9 @@ public class CSRPointer { cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), cudaMemcpyHostToDevice); cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), cudaMemcpyHostToDevice); if (DMLScript.STATISTICS) - GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - t0); + GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0); if (DMLScript.STATISTICS) - GPUStatistics.cudaToDevCount.addAndGet(3); + GPUStatistics.cudaToDevCount.add(3); } /** @@ -208,9 +220,9 @@ public class CSRPointer { cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), cudaMemcpyDeviceToHost); cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), cudaMemcpyDeviceToHost); if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0); + GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t0); if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevCount.addAndGet(3); + GPUStatistics.cudaFromDevCount.add(3); } /** http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index f107f47..84d181b 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -147,7 +147,7 @@ public class GPUContext { if (DMLScript.STATISTICS) GPUStatistics.cudaLibrariesInitTime = System.nanoTime() - start; - + LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, Available: " + (free[0] * (1e-6)) + " MB on " + this); @@ -269,7 +269,7 @@ public class GPUContext { freeCUDASpaceMap.remove(size); if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics - .maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0); + .maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0); } else { LOG.trace( "GPU : in allocate from instruction " + instructionName + ", allocating new block of size " + (size @@ -280,9 +280,9 @@ public class GPUContext { A = new Pointer(); cudaMalloc(A, size); if (DMLScript.STATISTICS) - GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0); + GPUStatistics.cudaAllocTime.add(System.nanoTime() - t0); if (DMLScript.STATISTICS) - GPUStatistics.cudaAllocCount.getAndAdd(statsCount); + GPUStatistics.cudaAllocCount.add(statsCount); if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_ALLOCATE, System.nanoTime() - t0); @@ -298,9 +298,9 @@ public class GPUContext { if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1); if (DMLScript.STATISTICS) - GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1); + GPUStatistics.cudaMemSet0Time.add(end - t1); if (DMLScript.STATISTICS) - GPUStatistics.cudaMemSet0Count.getAndAdd(1); + GPUStatistics.cudaMemSet0Count.add(1); cudaBlockSizeMap.put(A, size); return A; @@ -349,32 +349,32 @@ public class GPUContext { long t0 = 0; assert cudaBlockSizeMap.containsKey( toFree) : "ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up"; - long size = cudaBlockSizeMap.get(toFree); - if (eager) { - LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " - + this); - if (DMLScript.STATISTICS) - t0 = System.nanoTime(); - cudaFree(toFree); - cudaBlockSizeMap.remove(toFree); - if (DMLScript.STATISTICS) - GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0); - if (DMLScript.STATISTICS) - GPUStatistics.cudaDeAllocCount.addAndGet(1); - if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) - GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, - System.nanoTime() - t0); - } else { - LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this); - LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); - if (freeList == null) { - freeList = new LinkedList<Pointer>(); - freeCUDASpaceMap.put(size, freeList); - } - if (freeList.contains(toFree)) - throw new RuntimeException("GPU : Internal state corrupted, double free"); - freeList.add(toFree); - } + long size = cudaBlockSizeMap.get(toFree); + if (eager) { + LOG.trace("GPU : eagerly freeing cuda memory [ " + toFree + " ] for instruction " + instructionName + " on " + + this); + if (DMLScript.STATISTICS) + t0 = System.nanoTime(); + cudaFree(toFree); + cudaBlockSizeMap.remove(toFree); + if (DMLScript.STATISTICS) + GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0); + if (DMLScript.STATISTICS) + GPUStatistics.cudaDeAllocCount.add(1); + if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) + GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_CUDA_FREE, + System.nanoTime() - t0); + } else { + LOG.trace("GPU : lazily freeing cuda memory for instruction " + instructionName + " on " + this); + LinkedList<Pointer> freeList = freeCUDASpaceMap.get(size); + if (freeList == null) { + freeList = new LinkedList<Pointer>(); + freeCUDASpaceMap.put(size, freeList); + } + if (freeList.contains(toFree)) + throw new RuntimeException("GPU : Internal state corrupted, double free"); + freeList.add(toFree); + } } /** @@ -426,7 +426,7 @@ public class GPUContext { */ protected void evict(String instructionName, final long neededSize) throws DMLRuntimeException { LOG.trace("GPU : evict called from " + instructionName + " for size " + neededSize + " on " + this); - GPUStatistics.cudaEvictionCount.addAndGet(1); + GPUStatistics.cudaEvictionCount.add(1); // Release the set of free blocks maintained in a GPUObject.freeCUDASpaceMap // to free up space LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = freeCUDASpaceMap; http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 94ceb36..c3e23f3 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -32,9 +32,7 @@ import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; import java.util.Arrays; -import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; @@ -100,12 +98,12 @@ public class GPUObject { /** * number of read/write locks on this object (this GPUObject is being used in a current instruction) */ - protected AtomicInteger locks = new AtomicInteger(0); + protected AtomicLong locks = new AtomicLong(); /** * Timestamp, needed by {@link GPUContext#evict(long)} */ - AtomicLong timestamp = new AtomicLong(0); + AtomicLong timestamp = new AtomicLong(); /** * Whether this block is in sparse format @@ -131,7 +129,7 @@ public class GPUObject { that.allocateTensorDescriptor(me.tensorShape[0], me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]); } that.dirty = me.dirty; - that.locks = new AtomicInteger(me.locks.get()); + that.locks = new AtomicLong(me.locks.get()); that.timestamp = new AtomicLong(me.timestamp.get()); that.isSparse = me.isSparse; @@ -238,9 +236,9 @@ public class GPUObject { t2 = System.nanoTime(); cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2); + GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t2); if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevCount.addAndGet(1); + GPUStatistics.cudaFromDevCount.add(1); if (nnzC[0] == -1) { throw new DMLRuntimeException( @@ -353,9 +351,9 @@ public class GPUObject { cols)); // TODO: What if mat.getNnz() is -1 ? if (DMLScript.STATISTICS) - GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0); + GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0); if (DMLScript.STATISTICS) - GPUStatistics.cudaDenseToSparseCount.addAndGet(1); + GPUStatistics.cudaDenseToSparseCount.add(1); } /** @@ -430,9 +428,9 @@ public class GPUObject { if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start); if (DMLScript.STATISTICS) - GPUStatistics.cudaSparseToDenseTime.addAndGet(end - start); + GPUStatistics.cudaSparseToDenseTime.add(end - start); if (DMLScript.STATISTICS) - GPUStatistics.cudaSparseToDenseCount.addAndGet(1); + GPUStatistics.cudaSparseToDenseCount.add(1); } /** @@ -560,8 +558,8 @@ public class GPUObject { // If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0 if (v != 0) getGPUContext().getKernels() - .launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), - getJcudaDenseMatrixPtr(), v, numElems); + .launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), + getJcudaDenseMatrixPtr(), v, numElems); } /** @@ -657,12 +655,12 @@ public class GPUObject { * @throws DMLRuntimeException if there is no locked GPU Object or if could not obtain a {@link GPUContext} */ private void updateReleaseLocks(int l) throws DMLRuntimeException { - int newLocks = locks.addAndGet(l); + int newLocks = (int) locks.addAndGet(l); if (newLocks < 0) { throw new CacheException("Internal state error : Invalid number of locks on a GPUObject"); } - LOG.trace("GPU : updateReleaseLocks, new number of locks is " + locks.get() + ", on " + this + ", GPUContext=" + LOG.trace("GPU : updateReleaseLocks, new number of locks is " + newLocks + ", on " + this + ", GPUContext=" + getGPUContext()); GPUContext.EvictionPolicy evictionPolicy = getGPUContext().evictionPolicy; switch (evictionPolicy) { @@ -802,18 +800,18 @@ public class GPUObject { csrBlock = new SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(), cooBlock.indexes(), cooBlock.values()); if (DMLScript.STATISTICS) - GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); + GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0); if (DMLScript.STATISTICS) - GPUStatistics.cudaSparseConversionCount.incrementAndGet(); + GPUStatistics.cudaSparseConversionCount.increment(); } else if (block instanceof SparseBlockMCSR) { if (DMLScript.STATISTICS) t0 = System.nanoTime(); SparseBlockMCSR mcsrBlock = (SparseBlockMCSR) block; csrBlock = new SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size())); if (DMLScript.STATISTICS) - GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); + GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0); if (DMLScript.STATISTICS) - GPUStatistics.cudaSparseConversionCount.incrementAndGet(); + GPUStatistics.cudaSparseConversionCount.increment(); } else { throw new DMLRuntimeException("Unsupported sparse matrix format for CUDA operations"); } @@ -848,9 +846,9 @@ public class GPUObject { mat.release(); if (DMLScript.STATISTICS) - GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() - start); + GPUStatistics.cudaToDevTime.add(System.nanoTime() - start); if (DMLScript.STATISTICS) - GPUStatistics.cudaToDevCount.addAndGet(1); + GPUStatistics.cudaToDevCount.add(1); } public static int toIntExact(long l) throws DMLRuntimeException { @@ -882,9 +880,9 @@ public class GPUObject { mat.release(); if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start); + GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start); if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevCount.addAndGet(1); + GPUStatistics.cudaFromDevCount.add(1); } else if (getJcudaSparseMatrixPtr() != null) { if (!LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) throw new DMLRuntimeException( @@ -912,9 +910,9 @@ public class GPUObject { mat.acquireModify(tmp); mat.release(); if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start); + GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start); if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevCount.addAndGet(1); + GPUStatistics.cudaFromDevCount.add(1); } } else { throw new DMLRuntimeException(
