http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/hops/IndexingOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/IndexingOp.java 
b/src/main/java/org/apache/sysml/hops/IndexingOp.java
index 5f2ce34..6a3ddf4 100644
--- a/src/main/java/org/apache/sysml/hops/IndexingOp.java
+++ b/src/main/java/org/apache/sysml/hops/IndexingOp.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysml.hops;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.AggBinaryOp.SparkAggType;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.lops.Aggregate;
@@ -97,7 +98,13 @@ public class IndexingOp extends Hop
        
        @Override
        public boolean isGPUEnabled() {
-               return false;
+               if(!DMLScript.USE_ACCELERATOR) {
+                       return false;
+               }
+               else {
+                       // only matrix indexing is supported on GPU
+                       return (getDataType() == DataType.MATRIX);
+               }
        }
 
        @Override
@@ -172,7 +179,7 @@ public class IndexingOp extends Hop
                                        setLineNumbers(reindex);
                                        setLops(reindex);
                                }
-                               else //CP
+                               else //CP or GPU
                                {
                                        Lop dummy = 
Data.createLiteralLop(ValueType.INT, Integer.toString(-1));
                                        RangeBasedReIndex reindex = new 
RangeBasedReIndex(

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 36f57b4..8abfc69 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -27,6 +27,7 @@ import 
org.apache.sysml.runtime.instructions.gpu.BuiltinBinaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.BuiltinUnaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.ConvolutionGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.MatrixIndexingGPUInstruction;
 import 
org.apache.sysml.runtime.instructions.gpu.MatrixMatrixAxpyGPUInstruction;
 import 
org.apache.sysml.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE;
 import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction;
@@ -128,6 +129,10 @@ public class GPUInstructionParser  extends 
InstructionParser
                String2GPUInstructionType.put( ">"    , 
GPUINSTRUCTION_TYPE.RelationalBinary);
                String2GPUInstructionType.put( "<="   , 
GPUINSTRUCTION_TYPE.RelationalBinary);
                String2GPUInstructionType.put( ">="   , 
GPUINSTRUCTION_TYPE.RelationalBinary);
+               
+               // Indexing 
+               // right indexing: output = X[1:3, 4:5]
+               String2GPUInstructionType.put( "rangeReIndex", 
GPUINSTRUCTION_TYPE.MatrixIndexing); 
        }
        
        public static GPUInstruction parseSingleInstruction (String str ) 
@@ -187,6 +192,9 @@ public class GPUInstructionParser  extends InstructionParser
                        case RelationalBinary:
                                return 
RelationalBinaryGPUInstruction.parseInstruction(str);
 
+                       case MatrixIndexing:
+                               return 
MatrixIndexingGPUInstruction.parseInstruction(str);
+                               
                        default: 
                                throw new DMLRuntimeException("Invalid GPU 
Instruction Type: " + gputype );
                }

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index b962eb7..0f0b28e 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -43,7 +43,8 @@ public abstract class GPUInstruction extends Instruction
                ArithmeticBinary,
                BuiltinUnary,
                BuiltinBinary,
-               Builtin
+               Builtin,
+               MatrixIndexing
        };
 
        // Memory/conversions
@@ -112,6 +113,9 @@ public abstract class GPUInstruction extends Instruction
        public final static String MISC_TIMER_REDUCE_ALL_KERNEL =               
 "rallk";  // time spent in reduce all kernel
        public final static String MISC_TIMER_REDUCE_ROW_KERNEL =               
 "rrowk";  // time spent in reduce row kernel
        public final static String MISC_TIMER_REDUCE_COL_KERNEL =               
 "rcolk";  // time spent in reduce column kernel
+       
+       public final static String MISC_TIMER_RIX_DENSE_OP =                    
 "drix";    // time spent in the right indexing dense kernel
+       public final static String MISC_TIMER_RIX_SPARSE_DENSE_OP =             
 "sdrix";   // time spent in the right indexing sparse dense kernel
 
        // Deep learning operators
        public final static String MISC_TIMER_ACTIVATION_FORWARD_LIB =         
"nnaf";  // time spent in cudnnActivationForward

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
new file mode 100644
index 0000000..5e2c8fc
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MatrixIndexingGPUInstruction.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu;
+
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.runtime.matrix.operators.SimpleOperator;
+import org.apache.sysml.runtime.util.IndexRange;
+import org.apache.sysml.utils.GPUStatistics;
+
+public class MatrixIndexingGPUInstruction extends GPUInstruction {
+       CPOperand rowLower, rowUpper, colLower, colUpper;
+       CPOperand input1; CPOperand input2; CPOperand output;
+       
+       public MatrixIndexingGPUInstruction(Operator op, CPOperand in, 
+                       CPOperand rl, CPOperand ru, CPOperand cl, CPOperand cu, 
CPOperand out, String opcode, String istr){
+               super(op, opcode, istr);
+               _gputype = GPUINSTRUCTION_TYPE.MatrixIndexing;
+               rowLower = rl;
+               rowUpper = ru;
+               colLower = cl;
+               colUpper = cu;
+               input1 = in;
+               output = out;
+       }
+       
+       public MatrixIndexingGPUInstruction(Operator op, CPOperand lhsInput, 
CPOperand rhsInput, 
+                       CPOperand rl, CPOperand ru, CPOperand cl, CPOperand cu, 
CPOperand out, String opcode, String istr){
+               super(op, opcode, istr);
+               _gputype = GPUINSTRUCTION_TYPE.MatrixIndexing;
+               rowLower = rl;
+               rowUpper = ru;
+               colLower = cl;
+               colUpper = cu;
+               input1 = lhsInput;
+               input2 = rhsInput;
+               output = out;
+       }
+       
+       public static MatrixIndexingGPUInstruction parseInstruction ( String 
str ) throws DMLRuntimeException {
+               String[] parts = 
InstructionUtils.getInstructionPartsWithValueType(str);
+               String opcode = parts[0];
+               
+               if ( opcode.equalsIgnoreCase("rangeReIndex") ) {
+                       if ( parts.length == 7 ) {
+                               // Example: 
rangeReIndex:mVar1:Var2:Var3:Var4:Var5:mVar6
+                               CPOperand in, rl, ru, cl, cu, out;
+                               in = new CPOperand();
+                               rl = new CPOperand();
+                               ru = new CPOperand();
+                               cl = new CPOperand();
+                               cu = new CPOperand();
+                               out = new CPOperand();
+                               in.split(parts[1]);
+                               rl.split(parts[2]);
+                               ru.split(parts[3]);
+                               cl.split(parts[4]);
+                               cu.split(parts[5]);
+                               out.split(parts[6]);
+                               if( in.getDataType()==DataType.MATRIX )
+                                       return new 
MatrixIndexingGPUInstruction(new SimpleOperator(null), in, rl, ru, cl, cu, out, 
opcode, str);
+                               else 
+                                       throw new DMLRuntimeException("Can 
index only on Matrices in GPU");
+                       }
+                       else {
+                               throw new DMLRuntimeException("Invalid number 
of operands in instruction: " + str);
+                       }
+               } 
+               else if ( opcode.equalsIgnoreCase("leftIndex")) {
+                       if ( parts.length == 8 ) {
+                               // Example: 
leftIndex:mVar1:mvar2:Var3:Var4:Var5:Var6:mVar7
+                               CPOperand lhsInput, rhsInput, rl, ru, cl, cu, 
out;
+                               lhsInput = new CPOperand();
+                               rhsInput = new CPOperand();
+                               rl = new CPOperand();
+                               ru = new CPOperand();
+                               cl = new CPOperand();
+                               cu = new CPOperand();
+                               out = new CPOperand();
+                               lhsInput.split(parts[1]);
+                               rhsInput.split(parts[2]);
+                               rl.split(parts[3]);
+                               ru.split(parts[4]);
+                               cl.split(parts[5]);
+                               cu.split(parts[6]);
+                               out.split(parts[7]);
+                               if( lhsInput.getDataType()==DataType.MATRIX )
+                                       return new 
MatrixIndexingGPUInstruction(new SimpleOperator(null), lhsInput, rhsInput, rl, 
ru, cl, cu, out, opcode, str);
+                               else 
+                                       throw new DMLRuntimeException("Can 
index only on Matrices in GPU");
+                       }
+                       else {
+                               throw new DMLRuntimeException("Invalid number 
of operands in instruction: " + str);
+                       }
+               }
+               else {
+                       throw new DMLRuntimeException("Unknown opcode while 
parsing a MatrixIndexingGPUInstruction: " + str);
+               }
+       }
+
+       @Override
+       public void processInstruction(ExecutionContext ec)
+                       throws DMLRuntimeException {
+               GPUStatistics.incrementNoOfExecutedGPUInst();
+               String opcode = getOpcode();
+               
+               IndexRange ixrange = getIndexRange(ec);
+               if ( opcode.equalsIgnoreCase("rangeReIndex") ) {
+                       MatrixObject mat1 = getMatrixInputForGPUInstruction(ec, 
input1.getName());
+                       LibMatrixCUDA.sliceOperations(ec, ec.getGPUContext(0), 
getExtendedOpcode(), mat1, ixrange, output.getName());
+                       
ec.releaseMatrixInputForGPUInstruction(input1.getName());
+                       
ec.releaseMatrixOutputForGPUInstruction(output.getName());
+               }
+               else {
+                       throw new DMLRuntimeException("Unsupported GPU 
operator:" + opcode);
+               }
+       }
+       
+       IndexRange getIndexRange(ExecutionContext ec) throws 
DMLRuntimeException {
+               return new IndexRange( //rl, ru, cl, ru
+                       (int)(ec.getScalarInput(rowLower.getName(), 
rowLower.getValueType(), rowLower.isLiteral()).getLongValue()-1),
+                       (int)(ec.getScalarInput(rowUpper.getName(), 
rowUpper.getValueType(), rowUpper.isLiteral()).getLongValue()-1),
+                       (int)(ec.getScalarInput(colLower.getName(), 
colLower.getValueType(), colLower.isLiteral()).getLongValue()-1),
+                       (int)(ec.getScalarInput(colUpper.getName(), 
colUpper.getValueType(), colUpper.isLiteral()).getLongValue()-1));          
+       }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
index 7244938..a5bc299 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java
@@ -49,6 +49,18 @@ import jcuda.jcusparse.cusparsePointerMode;
 /**
  * Compressed Sparse Row (CSR) format for CUDA
  * Generalized matrix multiply is implemented for CSR format in the cuSparse 
library among other operations
+ * 
+ * Since we assume that the matrix is stored with zero-based indexing (i.e. 
CUSPARSE_INDEX_BASE_ZERO),
+ * the matrix
+ * 1.0 4.0 0.0 0.0 0.0 
+ * 0.0 2.0 3.0 0.0 0.0 
+ * 5.0 0.0 0.0 7.0 8.0 
+ * 0.0 0.0 9.0 0.0 6.0
+ * 
+ * is stored as
+ * val = 1.0 4.0 2.0 3.0 5.0 7.0 8.0 9.0 6.0 
+ * rowPtr = 0.0 2.0 4.0 7.0 9.0 
+ * colInd = 0.0 1.0 1.0 2.0 0.0 3.0 4.0 2.0 4.0
  */
 public class CSRPointer {
 
@@ -184,9 +196,9 @@ public class CSRPointer {
                cudaMemcpy(r.colInd, Pointer.to(colInd), getIntSizeOf(nnz), 
cudaMemcpyHostToDevice);
                cudaMemcpy(r.val, Pointer.to(values), getDoubleSizeOf(nnz), 
cudaMemcpyHostToDevice);
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() 
- t0);
+                       GPUStatistics.cudaToDevTime.add(System.nanoTime() - t0);
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaToDevCount.addAndGet(3);
+                       GPUStatistics.cudaToDevCount.add(3);
        }
 
        /**
@@ -208,9 +220,9 @@ public class CSRPointer {
                cudaMemcpy(Pointer.to(colInd), r.colInd, getIntSizeOf(nnz), 
cudaMemcpyDeviceToHost);
                cudaMemcpy(Pointer.to(values), r.val, getDoubleSizeOf(nnz), 
cudaMemcpyDeviceToHost);
                if (DMLScript.STATISTICS)
-                       
GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t0);
+                       GPUStatistics.cudaFromDevTime.add(System.nanoTime() - 
t0);
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaFromDevCount.addAndGet(3);
+                       GPUStatistics.cudaFromDevCount.add(3);
        }
 
        /**

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index f107f47..84d181b 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -147,7 +147,7 @@ public class GPUContext {
 
                if (DMLScript.STATISTICS)
                        GPUStatistics.cudaLibrariesInitTime = System.nanoTime() 
- start;
-               
+
                LOG.info(" GPU memory - Total: " + (total[0] * (1e-6)) + " MB, 
Available: " + (free[0] * (1e-6)) + " MB on "
                                + this);
 
@@ -269,7 +269,7 @@ public class GPUContext {
                                freeCUDASpaceMap.remove(size);
                        if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
                                GPUStatistics
-                                               
.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_REUSE, 
System.nanoTime() - t0);
+                               .maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_REUSE, System.nanoTime() - t0);
                } else {
                        LOG.trace(
                                        "GPU : in allocate from instruction " + 
instructionName + ", allocating new block of size " + (size
@@ -280,9 +280,9 @@ public class GPUContext {
                        A = new Pointer();
                        cudaMalloc(A, size);
                        if (DMLScript.STATISTICS)
-                               
GPUStatistics.cudaAllocTime.getAndAdd(System.nanoTime() - t0);
+                               
GPUStatistics.cudaAllocTime.add(System.nanoTime() - t0);
                        if (DMLScript.STATISTICS)
-                               
GPUStatistics.cudaAllocCount.getAndAdd(statsCount);
+                               GPUStatistics.cudaAllocCount.add(statsCount);
                        if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
                                
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_ALLOCATE,
                                                System.nanoTime() - t0);
@@ -298,9 +298,9 @@ public class GPUContext {
                if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
                        GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
+                       GPUStatistics.cudaMemSet0Time.add(end - t1);
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaMemSet0Count.getAndAdd(1);
+                       GPUStatistics.cudaMemSet0Count.add(1);
                cudaBlockSizeMap.put(A, size);
                return A;
 
@@ -349,32 +349,32 @@ public class GPUContext {
                long t0 = 0;
                assert cudaBlockSizeMap.containsKey(
                                toFree) : "ERROR : Internal state corrupted, 
cache block size map is not aware of a block it trying to free up";
-               long size = cudaBlockSizeMap.get(toFree);
-               if (eager) {
-                       LOG.trace("GPU : eagerly freeing cuda memory [ " + 
toFree + " ] for instruction " + instructionName + " on "
-                                       + this);
-                       if (DMLScript.STATISTICS)
-                               t0 = System.nanoTime();
-                       cudaFree(toFree);
-                       cudaBlockSizeMap.remove(toFree);
-                       if (DMLScript.STATISTICS)
-                               
GPUStatistics.cudaDeAllocTime.addAndGet(System.nanoTime() - t0);
-                       if (DMLScript.STATISTICS)
-                               GPUStatistics.cudaDeAllocCount.addAndGet(1);
-                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
-                               
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_CUDA_FREE,
-                                               System.nanoTime() - t0);
-               } else {
-                       LOG.trace("GPU : lazily freeing cuda memory for 
instruction " + instructionName + " on " + this);
-                       LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
-                       if (freeList == null) {
-                               freeList = new LinkedList<Pointer>();
-                               freeCUDASpaceMap.put(size, freeList);
-                       }
-                       if (freeList.contains(toFree))
-                               throw new RuntimeException("GPU : Internal 
state corrupted, double free");
-                       freeList.add(toFree);
-               }
+                               long size = cudaBlockSizeMap.get(toFree);
+                               if (eager) {
+                                       LOG.trace("GPU : eagerly freeing cuda 
memory [ " + toFree + " ] for instruction " + instructionName + " on "
+                                                       + this);
+                                       if (DMLScript.STATISTICS)
+                                               t0 = System.nanoTime();
+                                       cudaFree(toFree);
+                                       cudaBlockSizeMap.remove(toFree);
+                                       if (DMLScript.STATISTICS)
+                                               
GPUStatistics.cudaDeAllocTime.add(System.nanoTime() - t0);
+                                       if (DMLScript.STATISTICS)
+                                               
GPUStatistics.cudaDeAllocCount.add(1);
+                                       if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS)
+                                               
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_CUDA_FREE,
+                                                               
System.nanoTime() - t0);
+                               } else {
+                                       LOG.trace("GPU : lazily freeing cuda 
memory for instruction " + instructionName + " on " + this);
+                                       LinkedList<Pointer> freeList = 
freeCUDASpaceMap.get(size);
+                                       if (freeList == null) {
+                                               freeList = new 
LinkedList<Pointer>();
+                                               freeCUDASpaceMap.put(size, 
freeList);
+                                       }
+                                       if (freeList.contains(toFree))
+                                               throw new RuntimeException("GPU 
: Internal state corrupted, double free");
+                                       freeList.add(toFree);
+                               }
        }
 
        /**
@@ -426,7 +426,7 @@ public class GPUContext {
         */
        protected void evict(String instructionName, final long neededSize) 
throws DMLRuntimeException {
                LOG.trace("GPU : evict called from " + instructionName + " for 
size " + neededSize + " on " + this);
-               GPUStatistics.cudaEvictionCount.addAndGet(1);
+               GPUStatistics.cudaEvictionCount.add(1);
                // Release the set of free blocks maintained in a 
GPUObject.freeCUDASpaceMap
                // to free up space
                LRUCacheMap<Long, LinkedList<Pointer>> lruCacheMap = 
freeCUDASpaceMap;

http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 94ceb36..c3e23f3 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -32,9 +32,7 @@ import static 
jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
 import java.util.Arrays;
-import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
@@ -100,12 +98,12 @@ public class GPUObject {
        /**
         * number of read/write locks on this object (this GPUObject is being 
used in a current instruction)
         */
-       protected AtomicInteger locks = new AtomicInteger(0);
+       protected AtomicLong locks = new AtomicLong();
 
        /**
         * Timestamp, needed by {@link GPUContext#evict(long)}
         */
-       AtomicLong timestamp = new AtomicLong(0);
+       AtomicLong timestamp = new AtomicLong();
 
        /**
         * Whether this block is in sparse format
@@ -131,7 +129,7 @@ public class GPUObject {
                        that.allocateTensorDescriptor(me.tensorShape[0], 
me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]);
                }
                that.dirty = me.dirty;
-               that.locks = new AtomicInteger(me.locks.get());
+               that.locks = new AtomicLong(me.locks.get());
                that.timestamp = new AtomicLong(me.timestamp.get());
                that.isSparse = me.isSparse;
 
@@ -238,9 +236,9 @@ public class GPUObject {
                        t2 = System.nanoTime();
                cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, 
getIntSizeOf(1), cudaMemcpyDeviceToHost);
                if (DMLScript.STATISTICS)
-                       
GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - t2);
+                       GPUStatistics.cudaFromDevTime.add(System.nanoTime() - 
t2);
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaFromDevCount.addAndGet(1);
+                       GPUStatistics.cudaFromDevCount.add(1);
 
                if (nnzC[0] == -1) {
                        throw new DMLRuntimeException(
@@ -353,9 +351,9 @@ public class GPUObject {
                                                cols));
                // TODO: What if mat.getNnz() is -1 ?
                if (DMLScript.STATISTICS)
-                       
GPUStatistics.cudaDenseToSparseTime.addAndGet(System.nanoTime() - t0);
+                       
GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0);
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaDenseToSparseCount.addAndGet(1);
+                       GPUStatistics.cudaDenseToSparseCount.add(1);
        }
 
        /**
@@ -430,9 +428,9 @@ public class GPUObject {
                if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS)
                        GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, end - start);
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaSparseToDenseTime.addAndGet(end - 
start);
+                       GPUStatistics.cudaSparseToDenseTime.add(end - start);
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaSparseToDenseCount.addAndGet(1);
+                       GPUStatistics.cudaSparseToDenseCount.add(1);
        }
 
        /**
@@ -560,8 +558,8 @@ public class GPUObject {
                // If the fill value is 0, no need to call the special kernel, 
the allocate memsets the allocated region to 0
                if (v != 0)
                        getGPUContext().getKernels()
-                                       .launchKernel("fill", 
ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
-                                                       
getJcudaDenseMatrixPtr(), v, numElems);
+                       .launchKernel("fill", 
ExecutionConfig.getConfigForSimpleVectorOperations(numElems),
+                                       getJcudaDenseMatrixPtr(), v, numElems);
        }
 
        /**
@@ -657,12 +655,12 @@ public class GPUObject {
         * @throws DMLRuntimeException if there is no locked GPU Object or if 
could not obtain a {@link GPUContext}
         */
        private void updateReleaseLocks(int l) throws DMLRuntimeException {
-               int newLocks = locks.addAndGet(l);
+               int newLocks = (int) locks.addAndGet(l);
                if (newLocks < 0) {
                        throw new CacheException("Internal state error : 
Invalid number of locks on a GPUObject");
                }
 
-               LOG.trace("GPU : updateReleaseLocks, new number of locks is " + 
locks.get() + ", on " + this + ", GPUContext="
+               LOG.trace("GPU : updateReleaseLocks, new number of locks is " + 
newLocks + ", on " + this + ", GPUContext="
                                + getGPUContext());
                GPUContext.EvictionPolicy evictionPolicy = 
getGPUContext().evictionPolicy;
                switch (evictionPolicy) {
@@ -802,18 +800,18 @@ public class GPUObject {
                                        csrBlock = new 
SparseBlockCSR(toIntExact(mat.getNumRows()), cooBlock.rowIndexes(),
                                                        cooBlock.indexes(), 
cooBlock.values());
                                        if (DMLScript.STATISTICS)
-                                               
GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
+                                               
GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
                                        if (DMLScript.STATISTICS)
-                                               
GPUStatistics.cudaSparseConversionCount.incrementAndGet();
+                                               
GPUStatistics.cudaSparseConversionCount.increment();
                                } else if (block instanceof SparseBlockMCSR) {
                                        if (DMLScript.STATISTICS)
                                                t0 = System.nanoTime();
                                        SparseBlockMCSR mcsrBlock = 
(SparseBlockMCSR) block;
                                        csrBlock = new 
SparseBlockCSR(mcsrBlock.getRows(), toIntExact(mcsrBlock.size()));
                                        if (DMLScript.STATISTICS)
-                                               
GPUStatistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0);
+                                               
GPUStatistics.cudaSparseConversionTime.add(System.nanoTime() - t0);
                                        if (DMLScript.STATISTICS)
-                                               
GPUStatistics.cudaSparseConversionCount.incrementAndGet();
+                                               
GPUStatistics.cudaSparseConversionCount.increment();
                                } else {
                                        throw new 
DMLRuntimeException("Unsupported sparse matrix format for CUDA operations");
                                }
@@ -848,9 +846,9 @@ public class GPUObject {
                mat.release();
 
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaToDevTime.addAndGet(System.nanoTime() 
- start);
+                       GPUStatistics.cudaToDevTime.add(System.nanoTime() - 
start);
                if (DMLScript.STATISTICS)
-                       GPUStatistics.cudaToDevCount.addAndGet(1);
+                       GPUStatistics.cudaToDevCount.add(1);
        }
 
        public static int toIntExact(long l) throws DMLRuntimeException {
@@ -882,9 +880,9 @@ public class GPUObject {
                        mat.release();
 
                        if (DMLScript.STATISTICS)
-                               
GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
+                               
GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start);
                        if (DMLScript.STATISTICS)
-                               GPUStatistics.cudaFromDevCount.addAndGet(1);
+                               GPUStatistics.cudaFromDevCount.add(1);
                } else if (getJcudaSparseMatrixPtr() != null) {
                        if (!LibMatrixCUDA.isInSparseFormat(getGPUContext(), 
mat))
                                throw new DMLRuntimeException(
@@ -912,9 +910,9 @@ public class GPUObject {
                                mat.acquireModify(tmp);
                                mat.release();
                                if (DMLScript.STATISTICS)
-                                       
GPUStatistics.cudaFromDevTime.addAndGet(System.nanoTime() - start);
+                                       
GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start);
                                if (DMLScript.STATISTICS)
-                                       
GPUStatistics.cudaFromDevCount.addAndGet(1);
+                                       GPUStatistics.cudaFromDevCount.add(1);
                        }
                } else {
                        throw new DMLRuntimeException(

Reply via email to