SINGA-100 Implement layers using CUDNN for GPU training
Add support for compiling the cuDNN-related code.
Please run:
./configure --enable-cuda --with-cuda=/CUDA/PATH --enable-cudnn \
  --with-cudnn=/CUDNN/PATH
to generate the Makefile.
Fix a bug where a CPU-only build still ran the nvcc check and failed when nvcc was missing.
One test still fails when running "make test"; the failure is caused by
src/test/test_math.cc:349.
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/15b23a62
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/15b23a62
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/15b23a62
Branch: refs/heads/master
Commit: 15b23a62f9bb4fbcf53e0b5db446886d997e6e30
Parents: f8be9af
Author: xiezl <[email protected]>
Authored: Fri Dec 11 15:49:18 2015 +0800
Committer: xiezl <[email protected]>
Committed: Fri Dec 11 15:49:18 2015 +0800
----------------------------------------------------------------------
Makefile.am | 66 +++++++++++++++++++++++++++++++++++++---------
configure.ac | 51 +++++++++++++++++++++++++++++------
src/test/test_math.cc | 2 +-
3 files changed, 98 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15b23a62/Makefile.am
----------------------------------------------------------------------
diff --git a/Makefile.am b/Makefile.am
index 470ea8a..1959fb6 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -27,7 +27,7 @@ AUTOMAKE_OPTIONS = foreign subdir-objects
MSHADOW_FLAGS = -DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
DEFAULT_FLAGS = -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
- $(MSHADOW_FLAGS) -DCPU_ONLY=1 -funroll-loops -DTHREADED
+ $(MSHADOW_FLAGS) -funroll-loops -DTHREADED
CFLAGS = $(DEBUG)
CXXFLAGS = $(DEBUG)
@@ -49,8 +49,17 @@ PROTO_PYS := tool/python/pb2/singa_pb2.py \
tool/python/pb2/common_pb2.py
CUDA_SRCS := src/utils/math_kernel.cu
+CUDA_HDRS := include/singa/utils/math_kernel.h
-PY_SRCS := tool/python/singa/driver_wrap.cxx
+CUDNN_SRCS := src/neuralnet/loss_layer/cudnn_softmaxloss.cc \
+ src/neuralnet/neuron_layer/cudnn_softmax.cc \
+ src/neuralnet/neuron_layer/cudnn_pooling.cc \
+ src/neuralnet/neuron_layer/cudnn_activation.cc \
+ src/neuralnet/neuron_layer/cudnn_lrn.cc \
+ src/neuralnet/neuron_layer/cudnn_convolution.cc
+
+
+PY_SRCS := tool/python/singa/driver_wrap.cxx \
src/driver.cc
SINGA_SRCS := src/driver.cc \
@@ -103,7 +112,9 @@ SINGA_SRCS := src/driver.cc \
src/utils/image_transform.cc
SINGA_HDRS := include/singa.h \
- include/utils/cluster.h \
+ include/singa/utils/math_blob.h \
+ include/singa/utils/math_addr.h \
+ include/singa/utils/cluster.h \
include/utils/cluster_rt.h \
include/utils/param.h \
include/utils/common.h \
@@ -170,14 +181,20 @@ py_LTLIBRARIES = $(PY_PROGS)
#lib_LTLIBRARIES = libsinga.la
libsinga_la_SOURCES = $(PROTO_SRCS) $(SINGA_SRCS)
libsinga_la_CXXFLAGS = $(DEFAULT_FLAGS) -msse3 -fpermissive
-I$(top_srcdir)/include
+libsinga_la_LDFLAGS =
if LMDB
libsinga_la_CXXFLAGS += -DUSE_LMDB
endif
#libsinga_la_LDFLAGS = -I$(top_srcdir)/include
if DCUDA
-libsinga_la_SOURCES += $(CUDA_SRCS)
+libsinga_la_SOURCES += $(CUDA_SRCS) $(CUDA_HDRS)
libsinga_la_CXXFLAGS += $(CUDA_CFLAGS)
-libsinga_la_LDFLAGS = $(CUDA_LDFLAGS) $(CUDA_LIBS)
+libsinga_la_LDFLAGS += $(CUDA_LDFLAGS) $(CUDA_LIBS)
+endif
+if DCUDNN
+libsinga_la_SOURCES += $(CUDNN_SRCS)
+libsinga_la_CXXFLAGS += $(CUDNN_CFLAGS)
+libsinga_la_LDFLAGS += $(CUDNN_LDFLAGS) $(CUDNN_LIBS)
endif
@@ -197,19 +214,34 @@ singa_LDFLAGS += -llmdb
endif
if DCUDA
-singa_SOURCES += $(CUDA_SRCS)
+singa_SOURCES += $(CUDA_SRCS) $(CUDA_HDRS)
singa_CXXFLAGS += $(CUDA_CFLAGS)
singa_LDFLAGS += $(CUDA_LDFLAGS) $(CUDA_LIBS)
endif
+if DCUDNN
+singa_SOURCES += $(CUDNN_SRCS)
+singa_CXXFLAGS += $(CUDNN_CFLAGS)
+singa_LDFLAGS += $(CUDNN_LDFLAGS) $(CUDNN_LIBS)
+endif
#bin_PROGRAMS += singatool
-singatool_SOURCES = src/utils/tool.cc
+singatool_SOURCES = src/utils/tool.cc #$(CUDA_SRCS) $(CUDA_HDRS) $(CUDNN_SRCS)
singatool_CXXFLAGS = -Wall -pthread -fPIC -std=c++11 -MMD -Wno-unknown-pragmas
\
- -funroll-loops -DTHREADED -I$(top_srcdir)/include
+ -funroll-loops -DTHREADED
-I$(top_srcdir)/include $(DEFAULT_FLAGS)
singatool_LDFLAGS = -lsinga \
-lglog \
-lprotobuf \
- -lzookeeper_mt
+ -lzookeeper_mt
+if DCUDA
+singatool_SOURCES += $(CUDA_SRCS) $(CUDA_HDRS)
+singatool_CXXFLAGS += $(CUDA_CFLAGS)
+singatool_LDFLAGS += $(CUDA_LDFLAGS) $(CUDA_LIBS)
+endif
+if DCUDNN
+singatool_SOURCES += $(CUDNN_SRCS)
+singatool_CXXFLAGS += $(CUDNN_CFLAGS)
+singatool_LDFLAGS += $(CUDNN_LDFLAGS) $(CUDNN_LIBS)
+endif
#lib_LTLIBRARIES += libgtest.la
libgtest_la_SOURCES = $(GTEST_HDRS) $(GTEST_SRCS)
@@ -236,21 +268,31 @@ singatest_LDFLAGS = -lsinga \
if LMDB
singatest_LDFLAGS += -llmdb
endif
-if DCUDA
-singatest_SOURCES += $(CUDA_SRCS)
+if DCUDA
+singatest_SOURCES += $(CUDA_SRCS) $(CUDA_HDRS)
singatest_CXXFLAGS += $(CUDA_CFLAGS)
singatest_LDFLAGS += $(CUDA_LDFLAGS) $(CUDA_LIBS)
endif
+if DCUDNN
+singatest_SOURCES += $(CUDNN_SRCS)
+singatest_CXXFLAGS += $(CUDNN_CFLAGS)
+singatest_LDFLAGS += $(CUDNN_LDFLAGS) $(CUDNN_LIBS)
+endif
_driver_la_SOURCES = $(PY_SRCS)
_driver_la_CXXFLAGS = $(DEFAULT_FLAGS) $(MSHADOW_FLAGS)
-I$(top_srcdir)/include $(PYFLAGS)
_driver_la_LDFLAGS = -lsinga -module -shared $(PYLIBS) -avoid-version -rpath
$(pydir)
if DCUDA
-_driver_la_CXXFLAGS += $(CUDA_CFLAGS)
+_driver_la_CXXFLAGS += $(CUDA_CFLAGS)
_driver_la_LDFLAGS += $(CUDA_LDFLAGS) $(CUDA_LIBS)
endif
+if DCUDNN
+_driver_la_CXXFLAGS += $(CUDNN_CFLAGS)
+_driver_la_LDFLAGS += $(CUDNN_LDFLAGS) $(CUDNN_LIBS)
+endif
+
clean-local:
rm -rf $(PROTO_SRCS) $(PROTO_HDRS)
rm -rf $(PROTO_PYS)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15b23a62/configure.ac
----------------------------------------------------------------------
diff --git a/configure.ac b/configure.ac
index 902beab..85cc5fb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -55,16 +55,16 @@ if test "$cuda_prefix" == "yes"; then
fi
fi
-AC_MSG_CHECKING([nvcc in $cuda_prefix/bin])
-if test -x "$cuda_prefix/bin/nvcc"; then
- AC_MSG_RESULT([found])
- AC_DEFINE_UNQUOTED([NVCC_PATH], ["$cuda_prefix/bin/nvcc"], [Path to nvcc
binary])
-else
- AC_MSG_RESULT([not found!])
- AC_MSG_FAILURE([nvcc was not found in $cuda_prefix/bin])
-fi
if test x"$cudaval" = x"yes"; then
+ AC_MSG_CHECKING([nvcc in $cuda_prefix/bin])
+ if test -x "$cuda_prefix/bin/nvcc"; then
+ AC_MSG_RESULT([found])
+ AC_DEFINE_UNQUOTED([NVCC_PATH], ["$cuda_prefix/bin/nvcc"], [Path to
nvcc binary])
+ else
+ AC_MSG_RESULT([not found!])
+ AC_MSG_FAILURE([nvcc was not found in $cuda_prefix/bin])
+ fi
CUDA_CFLAGS="-I$cuda_prefix/include"
CUDA_LDFLAGS="-L$cuda_prefix/lib64 -L$cuda_prefix/lib"
CUDA_LIBS="-lcublas -lcudart -lcurand"
@@ -85,12 +85,47 @@ else
CUDA_LDFLAGS=""
CUDA_LIBS=""
NVCC=""
+ DEBUG="-DCPU_ONLY"
fi
AC_SUBST(NVCC)
AC_SUBST(CUDA_LDFLAGS)
AC_SUBST(CUDA_LIBS)
AC_SUBST(CUDA_CFLAGS)
+# Setup custom CUDNN paths
+AC_ARG_ENABLE([cudnn],
+ [AS_HELP_STRING(--enable-cudnn,enable CUDNN support)],
+ [enable_cudnn=yes], [enable_cudnn=no])
+AM_CONDITIONAL(DCUDNN, [test "$enable_cudnn" = "yes"])
+
+AC_ARG_WITH([cudnn],
+ [AS_HELP_STRING([--with-cudnn=PATH], [prefix where CUDNN is installed])],
+ [cudnn_prefix=$cudnnwithval], [cudnn_prefix="/usr/local/cuda"])
+if test "$cudnn_prefix" == "yes"; then
+ if test "$cudnnwithval" == "yes"; then
+ cudnn_prefix="/usr/local/cuda"
+ fi
+fi
+
+if test x"$enable_cudnn" == x"yes"; then
+ CUDNN_CFLAGS="-I$cudnn_prefix/include"
+ CUDNN_LDFLAGS="-L$cudnn_prefix/lib64 -L$cudnn_prefix/lib"
+ CUDNN_LIBS="-lcudnn"
+ DEBUG+=" -DUSE_CUDNN "
+ AC_DEFINE(DCUDNN,[1],[Defined if CUDNN should be used])
+ AC_CHECK_LIB([cudnn], [main], [], [
+ AC_MSG_ERROR([unable to find cudnn library])
+ ])
+else
+ CUDNN_CFLAGS=""
+ CUDNN_LDFLAGS=""
+ CUDNN_LIBS=""
+fi
+
+AC_SUBST(CUDNN_CFLAGS)
+AC_SUBST(CUDNN_LDFLAGS)
+AC_SUBST(CUDNN_LIBS)
+
# Checks for libraries.
AC_SEARCH_LIBS([cblas_sgemm], [openblas], [], [
AC_MSG_ERROR([unable to find cblas_sgemm() function])
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15b23a62/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
index c2730a4..0b9f0ff 100644
--- a/src/test/test_math.cc
+++ b/src/test/test_math.cc
@@ -335,7 +335,7 @@ TEST(MathTest, TestSingaSumColGPU) {
cudaMalloc(reinterpret_cast<void**>(&A_gpu), 12*sizeof(float));
cudaMalloc(reinterpret_cast<void**>(&B_gpu), 4*sizeof(float));
cudaMemcpy(A_gpu, A, 12*sizeof(float), cudaMemcpyHostToDevice);
- singa_gpu_sum_by_col(A_gpu, B_gpu, 3, 4, 4);
+ //singa_gpu_sum_row(A_gpu, B_gpu, 3, 4, 4);
cudaMemcpy(B, B_gpu, 4*sizeof(float), cudaMemcpyDeviceToHost);