[SYSTEMML-445] Refactored GPU Memory Manager

- Several bugfixes found during recent experiments with ResNet200.
- Added align_memory eviction policy.
- Added GPU usage documentation.
- Refactored the GPU Memory Manager into distinct components.
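For quick reference, the new policy is selected through the GPU buffer-pool property in SystemML-config.xml. Below is a minimal sketch of the relevant entries, copied from the updated SystemML-config.xml.template in the diff that follows (align_memory also becomes the template default; the debug flag is off by default):

```
<!-- the eviction policy for the GPU bufferpool: lru, mru, lfu, min_evict, align_memory -->
<sysml.gpu.eviction.policy>align_memory</sysml.gpu.eviction.policy>
<!-- developer flag for debugging GPU memory leaks; large overhead, keep off outside debugging -->
<sysml.gpu.print.memoryInfo>false</sysml.gpu.print.memoryInfo>
```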
Closes #774.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4d321667
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4d321667
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4d321667

Branch: refs/heads/master
Commit: 4d3216678f252f731ac7d7db62111dde6ca063f5
Parents: af9cc8a
Author: Niketan Pansare <[email protected]>
Authored: Thu May 31 12:25:06 2018 -0700
Committer: Niketan Pansare <[email protected]>
Committed: Thu May 31 12:26:44 2018 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template | 14 +-
 docs/gpu.md | 94 +
 src/main/cpp/kernels/SystemML.cu | 99 +-
 src/main/cpp/kernels/SystemML.ptx | 8587 +++++++++---------
 .../java/org/apache/sysml/api/DMLScript.java | 4 +-
 .../apache/sysml/api/ScriptExecutorUtils.java | 3 +
 .../java/org/apache/sysml/conf/DMLConfig.java | 10 +-
 .../controlprogram/caching/CacheableData.java | 4 +-
 .../instructions/gpu/context/CSRPointer.java | 21 +-
 .../instructions/gpu/context/GPUContext.java | 58 +-
 .../context/GPULazyCudaFreeMemoryManager.java | 171 +
 .../gpu/context/GPUMatrixMemoryManager.java | 184 +
 .../gpu/context/GPUMemoryManager.java | 654 +-
 .../instructions/gpu/context/GPUObject.java | 322 +-
 .../runtime/matrix/data/LibMatrixCUDA.java | 130 +-
 .../runtime/matrix/data/LibMatrixCuDNN.java | 54 +-
 .../LibMatrixCuDNNConvolutionAlgorithm.java | 4 +-
 .../data/LibMatrixCuDNNInputRowFetcher.java | 2 +-
 .../runtime/matrix/data/LibMatrixCuMatMult.java | 4 +-
 .../runtime/matrix/data/LibMatrixNative.java | 2 +-
 .../SinglePrecisionCudaSupportFunctions.java | 45 +-
 .../org/apache/sysml/utils/GPUStatistics.java | 37 +-
 22 files changed, 5646 insertions(+), 4857 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index a9c73c8..05d6a1a 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -85,17 +85,23 @@
    <sysml.gpu.availableGPUs>-1</sysml.gpu.availableGPUs>
 
    <!-- whether to synchronize GPUs after every GPU instruction -->
-   <sysml.gpu.sync.postProcess>true</sysml.gpu.sync.postProcess>
+   <sysml.gpu.sync.postProcess>false</sysml.gpu.sync.postProcess>
 
    <!-- whether to perform eager CUDA free on rmvar instruction -->
    <sysml.gpu.eager.cudaFree>false</sysml.gpu.eager.cudaFree>
+
+   <!-- Developer flag used to debug GPU memory leaks. This has a huge performance overhead and should only be turned on for debugging purposes. -->
+   <sysml.gpu.print.memoryInfo>false</sysml.gpu.print.memoryInfo>
 
    <!-- the floating point precision. supported values are double, single -->
    <sysml.floating.point.precision>double</sysml.floating.point.precision>
 
-   <!-- the eviction policy for the GPU bufferpool. supported values are lru, mru, lfu, min_evict -->
-   <sysml.gpu.eviction.policy>lru</sysml.gpu.eviction.policy>
+   <!-- the eviction policy for the GPU bufferpool. supported values are lru, mru, lfu, min_evict, align_memory -->
+   <sysml.gpu.eviction.policy>align_memory</sysml.gpu.eviction.policy>
 
    <!-- maximum wrap length for instruction and miscellaneous timer column of statistics -->
    <sysml.stats.maxWrapLength>30</sysml.stats.maxWrapLength>
-</root>
+
+   <!-- Advanced optimization: fraction of driver memory to use for caching (default: 0.15) -->
+   <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize>
+</root>
\ No newline at end of file


http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/docs/gpu.md
----------------------------------------------------------------------
diff --git a/docs/gpu.md b/docs/gpu.md
new file mode 100644
index 0000000..e9d7bca
--- /dev/null
+++ b/docs/gpu.md
@@ -0,0 +1,94 @@
+---
+layout: global
+title: Using SystemML with GPU
+description: Using SystemML with GPU
+---
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+<br/>
+
+# User Guide
+
+To use SystemML on GPUs, please ensure that [CUDA 9](https://developer.nvidia.com/cuda-90-download-archive) and
+[CuDNN 7](https://developer.nvidia.com/cudnn) are installed on your system.
+
+## Python users
+
+Please install SystemML using pip:
+- For the released version: `pip install systemml`
+- For the bleeding-edge version: `pip install https://sparktc.ibmcloud.com/repo/latest/systemml-1.2.0-SNAPSHOT-python.tar.gz`
+
+Then you can use the `setGPU(True)` method of the [MLContext](http://apache.github.io/systemml/spark-mlcontext-programming-guide.html) and
+[MLLearn](http://apache.github.io/systemml/beginners-guide-python.html#invoke-systemmls-algorithms) APIs to enable GPU usage.
+
+```python
+from systemml.mllearn import Caffe2DML
+lenet = Caffe2DML(spark, solver='lenet_solver.proto', input_shape=(1, 28, 28))
+lenet.setGPU(True)
+```
+To skip memory-checking and force all GPU-enabled operations on the GPU, please use the `setForceGPU(True)` method after the `setGPU(True)` method.
+
+```python
+from systemml.mllearn import Caffe2DML
+lenet = Caffe2DML(spark, solver='lenet_solver.proto', input_shape=(1, 28, 28))
+lenet.setGPU(True).setForceGPU(True)
+```
+
+## Command-line users
+
+To enable the GPU backend via the command line, please provide `systemml-1.*-extra.jar` in the classpath and pass the `-gpu` flag.
+
+```
+spark-submit --jars systemml-1.*-extra.jar SystemML.jar -f myDML.dml -gpu
+```
+
+To skip memory-checking and force all GPU-enabled operations on the GPU, please provide the `force` option to the `-gpu` flag.
+
+```
+spark-submit --jars systemml-1.*-extra.jar SystemML.jar -f myDML.dml -gpu force
+```
+
+## Scala users
+
+To enable the GPU backend via the command line, please provide `systemml-1.*-extra.jar` in the classpath and use
+the `setGPU(True)` method of the [MLContext](http://apache.github.io/systemml/spark-mlcontext-programming-guide.html) API to enable GPU usage.
+
+```
+spark-shell --jars systemml-1.*-extra.jar,SystemML.jar
+```
+
+# Troubleshooting guide
+
+- If you have an older gcc (< 5.0) and you get a `libstdc++.so.6: version CXXABI_1.3.8 not found` error, please upgrade to gcc v5+.
+On CentOS 5, you may have to compile gcc from source:
+
+```
+sudo yum install libmpc-devel mpfr-devel gmp-devel zlib-devel*
+curl ftp://ftp.gnu.org/pub/gnu/gcc/gcc-5.3.0/gcc-5.3.0.tar.bz2 -O
+tar xvfj gcc-5.3.0.tar.bz2
+cd gcc-5.3.0
+./configure --with-system-zlib --disable-multilib --enable-languages=c,c++
+num_cores=`grep -c ^processor /proc/cpuinfo`
+make -j $num_cores
+sudo make install
+```
\ No newline at end of file


http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 29ae820..55ebeaf 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -20,7 +20,7 @@
 /**********************************
 When updating a kernel or adding a new one, please compile the ptx file and commit it:
-nvcc -ptx -arch=sm_30 --std c++11 SystemML.cu
+nvcc -w -ptx -arch=sm_30 --std c++11 SystemML.cu
 ***********************************/
 
 #include <cfloat>
@@ -1961,3 +1961,100 @@ extern "C" __global__ void matrix_sigmoid_f(float *A, float *C, unsigned int size) {
   matrix_sigmoid(A, C, size);
 }
+
+// We can later fold this into our reduce method
+template <typename T>
+__device__ void compute_nnz(
+    T *g_idata,  ///< input data stored in device memory (of size n)
+    T *g_odata,  ///< output/temporary array stored in device memory (of size n)
+    unsigned int n)  ///< size of the input and temporary/output arrays
+{
+  // extern __shared__ T sdata[];
+  extern __shared__ __align__(sizeof(T)) unsigned char my_sdata[];
+  T *sdata = reinterpret_cast<T *>(my_sdata);
+
+  // perform first level of reduction,
+  // reading from global memory, writing to shared memory
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+  unsigned int gridSize = blockDim.x * 2 * gridDim.x;
+
+  T v = 0;
+
+  // we reduce multiple elements per thread. The number is determined by the
+  // number of active thread blocks (via gridDim). More blocks will result
+  // in a larger gridSize and therefore fewer elements per thread
+  while (i < n) {
+    v += g_idata[i] != 0 ? 1 : 0;
+    // ensure we don't read out of bounds
+    if (i + blockDim.x < n) v += g_idata[i + blockDim.x] != 0 ? 1 : 0;
+    i += gridSize;
+  }
+
+  // each thread puts its local sum into shared memory
+  sdata[tid] = v;
+  __syncthreads();
+
+  // do reduction in shared mem
+  if (blockDim.x >= 1024) {
+    if (tid < 512) {
+      sdata[tid] = v = v + sdata[tid + 512];
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 512) {
+    if (tid < 256) {
+      sdata[tid] = v = v + sdata[tid + 256];
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 256) {
+    if (tid < 128) {
+      sdata[tid] = v = v + sdata[tid + 128];
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 128) {
+    if (tid < 64) {
+      sdata[tid] = v = v + sdata[tid + 64];
+    }
+    __syncthreads();
+  }
+
+  if (tid < 32) {
+    // now that we are using warp-synchronous programming (below)
+    // we need to declare our shared memory volatile so that the compiler
+    // doesn't reorder stores to it and induce incorrect behavior.
+    volatile T *smem = sdata;
+    if (blockDim.x >= 64) {
+      smem[tid] = v = v + smem[tid + 32];
+    }
+    if (blockDim.x >= 32) {
+      smem[tid] = v = v + smem[tid + 16];
+    }
+    if (blockDim.x >= 16) {
+      smem[tid] = v = v + smem[tid + 8];
+    }
+    if (blockDim.x >= 8) {
+      smem[tid] = v = v + smem[tid + 4];
+    }
+    if (blockDim.x >= 4) {
+      smem[tid] = v = v + smem[tid + 2];
+    }
+    if (blockDim.x >= 2) {
+      smem[tid] = v = v + smem[tid + 1];
+    }
+  }
+
+  // write result for this block to global mem
+  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
+}
+
+
+extern "C" __global__ void compute_nnz_d(double *g_idata, double *g_odata, unsigned int n) {
+  compute_nnz(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void compute_nnz_f(float *g_idata, float *g_odata, unsigned int n) {
+  compute_nnz(g_idata, g_odata, n);
+}
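For orientation, below is a minimal, hypothetical host-side sketch of how a per-block count kernel like the new `compute_nnz_d` is typically launched and finished. SystemML's actual host-side driver lives in the Java runtime (the LibMatrixCUDA/GPUStatistics changes listed above) and is not part of this hunk; the helper name `count_nnz`, block size, and launch parameters here are illustrative assumptions. The sketch assumes a power-of-two block size, which the in-block reduction above requires.

```
// Hypothetical host-side helper (not part of this commit): launches compute_nnz_d
// with one shared-memory slot per thread, then sums the per-block partial counts.
#include <cuda_runtime.h>
#include <vector>

extern "C" __global__ void compute_nnz_d(double *g_idata, double *g_odata, unsigned int n);

double count_nnz(double *d_in, unsigned int n) {
  if (n == 0) return 0.0;
  const unsigned int threads = 256;                                   // power of two, as the reduction assumes
  const unsigned int blocks = (n + threads * 2 - 1) / (threads * 2);  // each thread reads two elements per stride

  double *d_partials = nullptr;
  cudaMalloc(&d_partials, blocks * sizeof(double));

  // Each block writes one partial nnz count into d_partials[blockIdx.x].
  compute_nnz_d<<<blocks, threads, threads * sizeof(double)>>>(d_in, d_partials, n);

  // Finish the reduction on the host; the blocking copy also synchronizes the default stream.
  std::vector<double> partials(blocks);
  cudaMemcpy(partials.data(), d_partials, blocks * sizeof(double), cudaMemcpyDeviceToHost);
  cudaFree(d_partials);

  double nnz = 0.0;
  for (double p : partials) nnz += p;
  return nnz;
}
```

When the number of blocks is large, a second `compute_nnz` pass over the partial counts would avoid the host-side loop.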
