This is an automated email from the ASF dual-hosted git repository.

patriczhao pushed a commit to branch v1.6.x
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/v1.6.x by this push:
     new ec42cc0  [MKLDNN] Add LSTMP to v1.6.x (#17959)
ec42cc0 is described below

commit ec42cc09af3a1b68356c7fc9bcd58cc6b8ac2a0c
Author: Zixuan Wei <[email protected]>
AuthorDate: Sat Apr 11 11:25:02 2020 +0800

    [MKLDNN] Add LSTMP to v1.6.x (#17959)
    
    * Add LSTMP to v1.6.x
    
    * Upgrade dnnl (mkldnn) to v1.4
    
    * Change some references from `mkldnn` to `dnnl`
    
    * Implement LSTMP in both the native and DNNL passes
    
    * Add unit test for LSTMP
    
    * Add env var to control the initialization behavior of RNN
    
    * Keep input dim as int type in the dnnl pass
    
    * Add logging message for MXNET_RNN_USE_WEIGHT_CACHE=1
    
    * CMake link dnnl rather than mkldnn
    
    * Fix lint
    
    * libmkldnn -> libdnnl
    
    * mkldnn_version.h -> dnnl_version.h
    
    * Update mkldnn to formal r1.4-rc
    
    * sync remote url
    
    * checkout r1.4-rc commit
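    For reference, a rough usage sketch (not part of the patch) of what this
    change enables on the CPU/MKL-DNN path: the `projection_size` argument of
    mxnet.gluon.rnn.LSTM and the two environment variables documented here are
    the pieces this commit touches; layer sizes, shapes and values below are
    purely illustrative, and the weight cache only applies when the parameters
    stay fixed throughout inference.

        import os
        # Assumed settings from this patch: keep the MKL-DNN fused RNN enabled
        # (its default) and cache the reordered weights for a fixed-weight
        # inference pipeline.
        os.environ["MXNET_USE_MKLDNN_RNN"] = "1"
        os.environ["MXNET_RNN_USE_WEIGHT_CACHE"] = "1"  # weights must not change at runtime

        import mxnet as mx

        # LSTM with projection (LSTMP): 512-wide hidden state projected down to 256.
        lstm = mx.gluon.rnn.LSTM(hidden_size=512, projection_size=256, num_layers=2)
        lstm.initialize()

        x = mx.nd.random.uniform(shape=(10, 4, 128))  # (seq_len, batch, input_size), layout 'TNC'
        y = lstm(x)     # inference only on CPU; output feature size equals projection_size
        print(y.shape)  # (10, 4, 256)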
---
 .gitignore                                         |   3 +-
 .gitmodules                                        |   2 +-
 3rdparty/mkldnn                                    |   2 +-
 CMakeLists.txt                                     |   2 +-
 Makefile                                           |   8 +-
 cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy       |   2 +-
 cd/mxnet_lib/static/Jenkins_pipeline.groovy        |   2 +-
 cd/python/pypi/pypi_package.sh                     |   6 +-
 ci/docker/runtime_functions.sh                     |   6 +-
 ci/jenkins/Jenkins_steps.groovy                    |   4 +-
 docs/static_site/src/pages/api/faq/env_var.md      |   4 +
 include/mkldnn/dnnl.h                              |   1 +
 include/mkldnn/dnnl.hpp                            |   1 +
 include/mkldnn/dnnl_types.h                        |   1 +
 include/mkldnn/mkldnn.hpp                          |   1 +
 include/mkldnn/mkldnn_dnnl_mangling.h              |   1 +
 include/mkldnn/mkldnn_version.h                    |   1 +
 mkldnn.mk                                          |   9 +-
 python/mxnet/gluon/rnn/rnn_layer.py                |   1 +
 .../assembly/src/main/assembly/assembly.xml        |   4 +-
 .../apache/mxnet/util/NativeLibraryLoader.scala    |   4 +-
 src/operator/nn/mkldnn/mkldnn_rnn-inl.h            |  60 +++++--
 src/operator/nn/mkldnn/mkldnn_rnn.cc               | 194 +++++++++++++--------
 src/operator/rnn-inl.h                             |  33 +++-
 src/operator/rnn.cc                                |  42 +++--
 src/operator/rnn_impl.h                            |  53 ++++--
 tests/cpp/operator/mkldnn_test.cc                  |   2 +-
 tests/nightly/JenkinsfileForBinaries               |   2 +-
 tests/python/mkl/test_mkldnn_install.py            |   2 +-
 tests/python/unittest/test_gluon_rnn.py            | 101 +++++++++++
 tools/pip/setup.py                                 |   8 +-
 tools/staticbuild/build_lib.sh                     |   4 +-
 32 files changed, 411 insertions(+), 155 deletions(-)

diff --git a/.gitignore b/.gitignore
index 94cc9ba..eb3225f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,4 +164,5 @@ coverage.xml
 cmake_options.yml
 
 # header file generated at compile time
-include/mkldnn/mkldnn_version.h
+include/mkldnn/dnnl_version.h
+include/mkldnn/dnnl_config.h
diff --git a/.gitmodules b/.gitmodules
index 90ef157..ceda2cf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -15,7 +15,7 @@
        url = https://github.com/google/googletest.git
 [submodule "3rdparty/mkldnn"]
        path = 3rdparty/mkldnn
-       url = https://github.com/intel/mkl-dnn.git
+       url = https://github.com/oneapi-src/oneDNN.git
        branch = master
 [submodule "3rdparty/tvm"]
        path = 3rdparty/tvm
diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
index a0a87d6..1b05a28 160000
--- a/3rdparty/mkldnn
+++ b/3rdparty/mkldnn
@@ -1 +1 @@
-Subproject commit a0a87d662edeef38d01db4ac5dd25f59a1f0881f
+Subproject commit 1b05a28eb9666efef83b281e4cc1936db5e6cf6c
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 12bc195..8e06d22 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -293,7 +293,7 @@ if(USE_MKLDNN)
   include_directories(3rdparty/mkldnn/include)
   include_directories(${PROJECT_BINARY_DIR}/3rdparty/mkldnn/include)
   add_definitions(-DMXNET_USE_MKLDNN=1)
-  list(APPEND mxnet_LINKER_LIBS mkldnn)
+  list(APPEND mxnet_LINKER_LIBS dnnl)
 endif()
 
 # Allow Cuda compiles outside of src tree to find things in 'src' and 'include'
diff --git a/Makefile b/Makefile
index 2b65152..4efa6ad 100644
--- a/Makefile
+++ b/Makefile
@@ -151,7 +151,7 @@ ifeq ($(USE_MKLDNN), 1)
        CFLAGS += -DMXNET_USE_MKLDNN=1
        CFLAGS += -I$(ROOTDIR)/src/operator/nn/mkldnn/
        CFLAGS += -I$(MKLDNNROOT)/include
-       LDFLAGS += -L$(MKLDNNROOT)/lib -L$(MKLDNNROOT)/lib64 -lmkldnn 
-Wl,-rpath,'$${ORIGIN}'
+       LDFLAGS += -L$(MKLDNNROOT)/lib -L$(MKLDNNROOT)/lib64 -ldnnl 
-Wl,-rpath,'$${ORIGIN}'
 endif
 
 # setup opencv
@@ -597,7 +597,7 @@ lib/libmxnet.so: $(ALLX_DEP)
        -Wl,${WHOLE_ARCH} $(filter %libnnvm.a, $^) -Wl,${NO_WHOLE_ARCH}
 ifeq ($(USE_MKLDNN), 1)
 ifeq ($(UNAME_S), Darwin)
-       install_name_tool -change '@rpath/libmkldnn.1.dylib' 
'@loader_path/libmkldnn.1.dylib' $@
+       install_name_tool -change '@rpath/libdnnl.1.dylib' 
'@loader_path/libdnnl.1.dylib' $@
 endif
 endif
 
@@ -689,8 +689,8 @@ rpkg:
        cp src/io/image_recordio.h R-package/src
        cp -rf lib/libmxnet.so R-package/inst/libs
 
-       if [ -e "lib/libmkldnn.so.1" ]; then \
-               cp -rf lib/libmkldnn.so.1 R-package/inst/libs; \
+       if [ -e "lib/libdnnl.so.1" ]; then \
+               cp -rf lib/libdnnl.so.1 R-package/inst/libs; \
        fi
 
        if [ -e "lib/libtvm_runtime.so" ]; then \
diff --git a/cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy 
b/cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy
index af68314..f912fac 100644
--- a/cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy
+++ b/cd/mxnet_lib/dynamic/Jenkins_pipeline.groovy
@@ -30,7 +30,7 @@ licenses = 'licenses/*'
 
 // libmxnet dependencies
 mx_deps = ''
-mx_mkldnn_deps = 'lib/libmkldnn.so.1'
+mx_mkldnn_deps = 'lib/libdnnl.so.1'
 
 // library type
 // either static or dynamic - depending on how it links to its dependencies
diff --git a/cd/mxnet_lib/static/Jenkins_pipeline.groovy 
b/cd/mxnet_lib/static/Jenkins_pipeline.groovy
index ac2e450..4c2c3ed 100644
--- a/cd/mxnet_lib/static/Jenkins_pipeline.groovy
+++ b/cd/mxnet_lib/static/Jenkins_pipeline.groovy
@@ -31,7 +31,7 @@ licenses = 'licenses/*'
 
 // libmxnet dependencies
 mx_deps = 'lib/libgfortran.so.3, lib/libquadmath.so.0'
-mx_mkldnn_deps = 'lib/libgfortran.so.3, lib/libquadmath.so.0, 
lib/libmkldnn.so.1, 3rdparty/mkldnn/build/install/include/mkldnn_version.h'
+mx_mkldnn_deps = 'lib/libgfortran.so.3, lib/libquadmath.so.0, 
lib/libdnnl.so.1, 3rdparty/mkldnn/build/install/include/dnnl_version.h'
 
 // library type
 // either static or dynamic - depending on how it links to its dependencies
diff --git a/cd/python/pypi/pypi_package.sh b/cd/python/pypi/pypi_package.sh
index 581bf69..39137ae 100755
--- a/cd/python/pypi/pypi_package.sh
+++ b/cd/python/pypi/pypi_package.sh
@@ -22,13 +22,13 @@ set -ex
 export mxnet_variant=${1:?"Please specify the mxnet variant"}
 
 # Due to this PR: https://github.com/apache/incubator-mxnet/pull/14899
-# The setup.py expects that mkldnn_version.h be present in 
+# The setup.py expects that dnnl_version.h be present in 
 # mxnet-build/3rdparty/mkldnn/build/install/include
 # The artifact repository stores this file in the dependencies
 # and CD unpacks it to a directory called cd_misc
-if [ -f "cd_misc/mkldnn_version.h" ]; then
+if [ -f "cd_misc/dnnl_version.h" ]; then
   mkdir -p 3rdparty/mkldnn/build/install/include
-  cp cd_misc/mkldnn_version.h 3rdparty/mkldnn/build/install/include/.
+  cp cd_misc/dnnl_version.h 3rdparty/mkldnn/build/install/include/.
 fi
 
 # Create wheel workspace
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index fcea6c1..d5de024 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -876,9 +876,9 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
-    # libmkldnn.so.1 is a link file. We need an actual binary file named 
libmkldnn.so.1.
-    cp 3rdparty/mkldnn/src/libmkldnn.so.1 
3rdparty/mkldnn/src/libmkldnn.so.1.tmp
-    mv 3rdparty/mkldnn/src/libmkldnn.so.1.tmp 
3rdparty/mkldnn/src/libmkldnn.so.1
+    # libdnnl.so.1 is a link file. We need an actual binary file named 
libdnnl.so.1.
+    cp 3rdparty/mkldnn/src/libdnnl.so.1 3rdparty/mkldnn/src/libdnnl.so.1.tmp
+    mv 3rdparty/mkldnn/src/libdnnl.so.1.tmp 3rdparty/mkldnn/src/libdnnl.so.1
 }
 
 build_ubuntu_gpu_cmake() {
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index 0770320..be66350 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -35,8 +35,8 @@ mx_cmake_lib_no_tvm_op = 'build/libmxnet.so, 
build/libmxnet.a, build/libsample_l
 mx_cmake_lib_cython = 'build/libmxnet.so, build/libmxnet.a, 
build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, 
build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, 
build/3rdparty/openmp/runtime/src/libomp.so, python/mxnet/_cy2/*.so, 
python/mxnet/_cy3/*.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static 
library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, 
build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, 
build/libsample_lib.so, build/3rdparty/dmlc-core/libdmlc.a, 
build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, 
build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, 
build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, 
build/3rdparty/openmp/runtime/src/libomp.so, 
build/3rdparty/mkldnn/src/libmkldnn.so.1'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, 
lib/libtvmop.so, lib/tvmop.conf, libsample_lib.so, lib/libmkldnn.so.1, 
3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, 
build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, 
build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, 
build/3rdparty/openmp/runtime/src/libomp.so, 
build/3rdparty/mkldnn/src/libdnnl.so.1'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, 
lib/libtvmop.so, lib/tvmop.conf, libsample_lib.so, lib/libdnnl.so.1, 
3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'build/libmxnet.so, build/3rdparty/tvm/libtvm_runtime.so, 
build/libtvmop.so, build/tvmop.conf, lib/libnvonnxparser_runtime.so.0, 
lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
 mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, 
lib/libtvmop.so, lib/tvmop.conf, libsample_lib.so, 
3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 
3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, 
build/cpp-package/example/*, python/mxnet/_cy2/*.so, python/mxnet/_cy3/*.so'
 mx_lib_cpp_examples_no_tvm_op = 'lib/libmxnet.so, lib/libmxnet.a, 
libsample_lib.so, 3rdparty/dmlc-core/libdmlc.a, 
3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, 
deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*, 
python/mxnet/_cy2/*.so, python/mxnet/_cy3/*.so'
diff --git a/docs/static_site/src/pages/api/faq/env_var.md 
b/docs/static_site/src/pages/api/faq/env_var.md
index d63da61..add0a96 100644
--- a/docs/static_site/src/pages/api/faq/env_var.md
+++ b/docs/static_site/src/pages/api/faq/env_var.md
@@ -347,6 +347,10 @@ If ctypes is used, it must be 
`mxnet._ctypes.ndarray.NDArrayBase`.
   - Values: 0(false) or 1(true) ```(default=1)```
   - This variable controls whether to use the MKL-DNN backend in fused RNN 
operator for CPU context. There are two fusion implementations of RNN operator 
in MXNet. The MKL-DNN implementation has a better performance than the naive 
one, but the latter is more stable in the backward operation currently.
 
+* MXNET_RNN_USE_WEIGHT_CACHE
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If this variable is set, MXNet will ignore changes to the version of the 
NDArray passed as the parameter input of the RNN operator. In the Gluon API, a 
`_rnn_param_concat` operator concatenates the RNN weights and bias into a 
single parameter tensor, which bumps its version number. Since the parameter 
values are invariant during inference, the RNN operator can ignore this version 
change and avoid the considerable overhead of re-initializing the parameters.
+
 Settings for Minimum Memory Usage
 ---------------------------------
 - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
diff --git a/include/mkldnn/dnnl.h b/include/mkldnn/dnnl.h
new file mode 120000
index 0000000..44625f5
--- /dev/null
+++ b/include/mkldnn/dnnl.h
@@ -0,0 +1 @@
+../../3rdparty/mkldnn/include/dnnl.h
\ No newline at end of file
diff --git a/include/mkldnn/dnnl.hpp b/include/mkldnn/dnnl.hpp
new file mode 120000
index 0000000..4dfc038
--- /dev/null
+++ b/include/mkldnn/dnnl.hpp
@@ -0,0 +1 @@
+../../3rdparty/mkldnn/include/dnnl.hpp
\ No newline at end of file
diff --git a/include/mkldnn/dnnl_types.h b/include/mkldnn/dnnl_types.h
new file mode 120000
index 0000000..750b64c
--- /dev/null
+++ b/include/mkldnn/dnnl_types.h
@@ -0,0 +1 @@
+../../3rdparty/mkldnn/include/dnnl_types.h
\ No newline at end of file
diff --git a/include/mkldnn/mkldnn.hpp b/include/mkldnn/mkldnn.hpp
new file mode 120000
index 0000000..2cb212a
--- /dev/null
+++ b/include/mkldnn/mkldnn.hpp
@@ -0,0 +1 @@
+../../3rdparty/mkldnn/include/mkldnn.hpp
\ No newline at end of file
diff --git a/include/mkldnn/mkldnn_dnnl_mangling.h 
b/include/mkldnn/mkldnn_dnnl_mangling.h
new file mode 120000
index 0000000..876ad64
--- /dev/null
+++ b/include/mkldnn/mkldnn_dnnl_mangling.h
@@ -0,0 +1 @@
+../../3rdparty/mkldnn/include/mkldnn_dnnl_mangling.h
\ No newline at end of file
diff --git a/include/mkldnn/mkldnn_version.h b/include/mkldnn/mkldnn_version.h
new file mode 120000
index 0000000..76927f2
--- /dev/null
+++ b/include/mkldnn/mkldnn_version.h
@@ -0,0 +1 @@
+../../3rdparty/mkldnn/include/mkldnn_version.h
\ No newline at end of file
diff --git a/mkldnn.mk b/mkldnn.mk
index aa92108..9781d7a 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -21,9 +21,9 @@ ifeq ($(USE_MKLDNN), 1)
        MXNET_LIBDIR = $(ROOTDIR)/lib
        MXNET_INCLDIR = $(ROOTDIR)/include
 ifeq ($(UNAME_S), Darwin)
-       MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.1.dylib
+       MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libdnnl.1.dylib
 else
-       MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so.1
+       MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libdnnl.so.1
 endif
 endif
 
@@ -38,10 +38,13 @@ $(MKLDNN_LIBFILE):
        $(MAKE) -C $(MKLDNN_BUILDDIR) install
        mkdir -p $(MXNET_LIBDIR)
        cp $(MKLDNN_LIBFILE) $(MXNET_LIBDIR)
-       cp $(MKLDNN_BUILDDIR)/include/mkldnn_version.h $(MXNET_INCLDIR)/mkldnn/.
+       cp $(MKLDNN_BUILDDIR)/include/dnnl_version.h $(MXNET_INCLDIR)/mkldnn/.
+       cp ${MKLDNN_BUILDDIR}/include/dnnl_config.h ${MXNET_INCLDIR}/mkldnn/.
 
 mkldnn_clean:
        $(RM) -r 3rdparty/mkldnn/build
+       ${RM} -r include/mkldnn/dnnl_version.h
+       ${RM} -r include/mkldnn/dnnl_config.h
 
 ifeq ($(USE_MKLDNN), 1)
 mkldnn: mkldnn_build
diff --git a/python/mxnet/gluon/rnn/rnn_layer.py 
b/python/mxnet/gluon/rnn/rnn_layer.py
index f4489b7..cff9d5f 100644
--- a/python/mxnet/gluon/rnn/rnn_layer.py
+++ b/python/mxnet/gluon/rnn/rnn_layer.py
@@ -61,6 +61,7 @@ class _RNNLayer(HybridBlock):
         self._lstm_state_clip_nan = lstm_state_clip_nan
         self._dtype = dtype
         self._use_sequence_length = use_sequence_length
+        self.skip_states = None
 
         self._gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]
 
diff --git a/scala-package/assembly/src/main/assembly/assembly.xml 
b/scala-package/assembly/src/main/assembly/assembly.xml
index 060a97b..ae3bfb6 100644
--- a/scala-package/assembly/src/main/assembly/assembly.xml
+++ b/scala-package/assembly/src/main/assembly/assembly.xml
@@ -57,8 +57,8 @@
         <include>libtvm_runtime.so</include>
         <include>libgfortran.so.3</include>
         <include>libquadmath.so.0</include>
-        <include>libmkldnn.so.1</include>
-        <include>libmkldnn.1.dylib</include>
+        <include>libdnnl.so.1</include>
+        <include>libdnnl.1.dylib</include>
       </includes>
       <outputDirectory>lib/native</outputDirectory>
     </fileSet>
diff --git 
a/scala-package/core/src/main/scala/org/apache/mxnet/util/NativeLibraryLoader.scala
 
b/scala-package/core/src/main/scala/org/apache/mxnet/util/NativeLibraryLoader.scala
index 49e5d68..855ab30 100644
--- 
a/scala-package/core/src/main/scala/org/apache/mxnet/util/NativeLibraryLoader.scala
+++ 
b/scala-package/core/src/main/scala/org/apache/mxnet/util/NativeLibraryLoader.scala
@@ -89,8 +89,8 @@ private[mxnet] object NativeLibraryLoader {
     saveLibraryToTemp("libtvm_runtime.so", "/lib/native/libtvm_runtime.so", 
false)
     saveLibraryToTemp("libgfortran.so.3", "/lib/native/libgfortran.so.3", 
false)
     saveLibraryToTemp("libquadmath.so.0", "/lib/native/libquadmath.so.0", 
false)
-    saveLibraryToTemp("libmkldnn.so.1", "/lib/native/libmkldnn.so.1", false)
-    saveLibraryToTemp("libmkldnn.1.dylib", "/lib/native/libmkldnn.1.dylib", 
false)
+    saveLibraryToTemp("libdnnl.so.1", "/lib/native/libdnnl.so.1", false)
+    saveLibraryToTemp("libdnnl.1.dylib", "/lib/native/libdnnl.1.dylib", false)
     val tempfile: File = saveLibraryToTemp(libname, libFileInJar, true)
 
     loadLibraryFromFile(libname, tempfile)
diff --git a/src/operator/nn/mkldnn/mkldnn_rnn-inl.h 
b/src/operator/nn/mkldnn/mkldnn_rnn-inl.h
index a4104bf..7e02fc2 100644
--- a/src/operator/nn/mkldnn/mkldnn_rnn-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_rnn-inl.h
@@ -47,14 +47,17 @@ struct MKLDNNRnnLayerParam {
   int batch_size;
   int input_size;
   int state_size;
+  int proj_size;
   int seq_len;
 
   dims src_dims;           // Dimensions of source input in format_tag::tnc
   dims weight_layer_dims;  // Dimensions of layer weights in format_tag::ldigo
   dims weight_iter_dims;   // Dimensions of iter weights in format_tag::ldigo
+  dims weight_proj_dims;   // Dimensions of projection weights in 
format_tag::ldio
   dims bias_dims;          // Dimensions of bias in format_tag::ldgo
   dims dst_dims;           // Dimensions of output in format_tag::tnc
   dims state_dims;         // Dimensions of the state cell in format_tag::ldnc
+  dims cell_dims;          // Dimensions of LSTM cell state in format_tag::ldnc
 
   size_t workspace_size;  // used for the cached mkl-dnn memory in Forward 
inference
   size_t reserve_size;    // used for the reserved cached memory in Backward
@@ -64,11 +67,11 @@ struct MKLDNNRnnLayerParam {
   size_t single_state_size;    // state size of a single cell, hy, cy
 
   MKLDNNRnnLayerParam(int num_layer, int batch_size, int seq_len,
-                      int input_size, int state_size,
+                      int input_size, int state_size, int proj_size,
                       int mode, bool bidirectional = true)
       : mode(mode), bidirectional(bidirectional), state_outputs(true),
         num_layer(num_layer), batch_size(batch_size), input_size(input_size),
-        state_size(state_size), seq_len(seq_len) { }
+        state_size(state_size), proj_size(proj_size), seq_len(seq_len) { }
 
   void SetDims();
 };
@@ -100,7 +103,15 @@ class MKLDNNRnnMemMgr {
   std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
 
  public:
-  void Init(dim_t size, const Context& ctx, int dtype = mshadow::kFloat32);
+  /*!
+   * \brief Initializer for RNN memory manager
+   * \param size size of the buffer in bytes
+   * \param ctx Context of the device environment
+   */
+  void Init(dim_t size, const Context& ctx);
+
+  // Return the number of bytes in the buffer
+  const size_t Size() { return mem_size; }
 
   void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) {
     mem_holder.push_back(mem);
@@ -129,6 +140,7 @@ class RnnPrimitive {
     auto fwd_pd = reinterpret_cast<typename 
rnn_fwd::primitive_desc*>(rnn_fwd_prim.fwd_pd_.get());
     rnn_fwd_prim.weights_layer_desc_ = fwd_pd->weights_layer_desc();
     rnn_fwd_prim.weights_iter_desc_  = fwd_pd->weights_iter_desc();
+    rnn_fwd_prim.weights_proj_desc_  = fwd_pd->weights_projection_desc();
     rnn_fwd_prim.workspace_desc_ = fwd_pd->workspace_desc();
 
     rnn_fwd_prim.primitive_ = std::shared_ptr<mkldnn::primitive>(new 
rnn_fwd(*fwd_pd));
@@ -141,6 +153,7 @@ class RnnPrimitive {
     this->primitive_ = nullptr;
     this->weights_layer_desc_ = mkldnn::memory::desc();
     this->weights_iter_desc_ = mkldnn::memory::desc();
+    this->weights_proj_desc_ = mkldnn::memory::desc();
     this->workspace_desc_ = mkldnn::memory::desc();
   }
 
@@ -149,6 +162,7 @@ class RnnPrimitive {
     this->primitive_ = rnn_fwd_prim.primitive_;
     this->weights_layer_desc_ = rnn_fwd_prim.weights_layer_desc_;
     this->weights_iter_desc_ = rnn_fwd_prim.weights_iter_desc_;
+    this->weights_proj_desc_ = rnn_fwd_prim.weights_proj_desc_;
     this->workspace_desc_ = rnn_fwd_prim.workspace_desc_;
   }
 
@@ -158,6 +172,7 @@ class RnnPrimitive {
       this->primitive_ = rnn_fwd_prim.primitive_;
       this->weights_layer_desc_ = rnn_fwd_prim.weights_layer_desc_;
       this->weights_iter_desc_ = rnn_fwd_prim.weights_iter_desc_;
+      this->weights_proj_desc_ = rnn_fwd_prim.weights_proj_desc_;
       this->workspace_desc_ = rnn_fwd_prim.workspace_desc_;
     }
 
@@ -175,6 +190,10 @@ class RnnPrimitive {
     return weights_iter_desc_;
   }
 
+  const mkldnn::memory::desc& GetProjDesc() const {
+    return weights_proj_desc_;
+  }
+
   const mkldnn::memory::desc& GetWorkspaceDesc() const {
     return workspace_desc_;
   }
@@ -184,6 +203,7 @@ class RnnPrimitive {
   std::shared_ptr<mkldnn::primitive> primitive_;
   mkldnn::memory::desc weights_layer_desc_;
   mkldnn::memory::desc weights_iter_desc_;
+  mkldnn::memory::desc weights_proj_desc_;
   mkldnn::memory::desc workspace_desc_;
 };
 
@@ -195,27 +215,29 @@ RnnPrimitive GetRnnFwdPrim(const MKLDNNRnnLayerParam 
&layer_param, const bool is
  */
 class MKLDNNRnnForward {
  public:
-  MKLDNNRnnForward(const MKLDNNRnnLayerParam &layer_param, const bool is_train,
-                   const NDArray &data, const NDArray &params)
-      : initialized_(false), param_(layer_param),
+  MKLDNNRnnForward(const Context ctx,
+                   const MKLDNNRnnLayerParam &layer_param,
+                   const bool is_train,
+                   const NDArray &data,
+                   const NDArray &params)
+      : ctx_(ctx), initialized_(false), param_(layer_param),
         fwd_inf_(GetRnnFwdPrim(layer_param, false, data, params)) { }
 
   void SetNewDataMem(void* x, void* hx, void* cx,
                      void* y, void* hy, void* cy,
                      const int dtype = mshadow::kFloat32);
-  void SetWeightsMem(MKLDNNRnnMemMgr* mgr, void* w_ptr, void* b_ptr,
+  void SetWeightsMem(void* w_ptr, void* b_ptr,
                      const bool is_train = false,
                      const int dtype = mshadow::kFloat32);
   void ReorderWeights();
 
   const mkldnn::primitive& GetFwd() const { return fwd_inf_.GetPrim(); }
 
-  const size_t GetSize(int dtype) const {
-    size_t bytes = mshadow::mshadow_sizeof(dtype);
-    size_t size = 0;
-    size += fwd_inf_.GetLayerDesc().get_size();
-    size += fwd_inf_.GetIterDesc().get_size();
-    return size / bytes + 1;
+  const size_t GetSize() const {
+    const size_t size = fwd_inf_.GetLayerDesc().get_size()
+                        + fwd_inf_.GetIterDesc().get_size()
+                        + fwd_inf_.GetProjDesc().get_size();
+    return size;
   }
 
   const MKLDNNRnnLayerParam &GetParam() const { return param_; }
@@ -226,16 +248,20 @@ class MKLDNNRnnForward {
   void Reset() { initialized_ = false; }
 
  private:
+  Context ctx_;
   bool initialized_;
   MKLDNNRnnLayerParam param_;
   RnnPrimitive fwd_inf_;    // forward inference primitive
 
+  MKLDNNRnnMemMgr mem_mgr_;
   mkldnn::memory *weights_layer_ = nullptr;
   mkldnn::memory *weights_iter_ = nullptr;
+  mkldnn::memory *weights_proj_ = nullptr;
   mkldnn::memory *bias_ = nullptr;
 
   mkldnn::memory *weights_layer_r_ = nullptr;
   mkldnn::memory *weights_iter_r_ = nullptr;
+  mkldnn::memory *weights_proj_r_ = nullptr;
 
   /*
    * net_args must contain some keys as below:
@@ -378,7 +404,6 @@ class MKLDNNRnnBackward {
   const mkldnn_args_map_t& GetArgsMap() const { return net_args_; }
 
  private:
-  bool initialized_;
   RnnBwdPrimitive bwd_;
   const MKLDNNRnnForwardTraining* fwd_ptr_;
 
@@ -441,6 +466,13 @@ class MKLDNNRnnOp {
             const std::vector<NDArray> &outputs);
 };
 
+inline bool SupportMKLDNNRnn(const int input_dtype) {
+  if (input_dtype == mshadow::kFloat32 && dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 
1)) {
+    return true;
+  }
+  return false;
+}
+
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/mkldnn/mkldnn_rnn.cc 
b/src/operator/nn/mkldnn/mkldnn_rnn.cc
index 8af0e99..29e3f2e 100644
--- a/src/operator/nn/mkldnn/mkldnn_rnn.cc
+++ b/src/operator/nn/mkldnn/mkldnn_rnn.cc
@@ -53,18 +53,22 @@ void MKLDNNRnnLayerParam::SetDims() {
   const int nbias = mode == rnn_enum::kGru ? (ngates + 1) : ngates;
   const int num_direction = bidirectional ? 2 : 1;
 
+  const int iter_size = proj_size < 0 ? state_size : proj_size;
   src_dims.assign({seq_len, batch_size, input_size});
   weight_layer_dims.assign({num_layer, num_direction, input_size, ngates, 
state_size});
-  weight_iter_dims.assign({num_layer, num_direction, state_size, ngates, 
state_size});
+  weight_iter_dims.assign({num_layer, num_direction, iter_size, ngates, 
state_size});
+  weight_proj_dims.assign({num_layer, num_direction, state_size, iter_size});
   bias_dims.assign({num_layer, num_direction, nbias, state_size});
-  dst_dims.assign({seq_len, batch_size, state_size * num_direction});
-  state_dims.assign({num_layer, num_direction, batch_size, state_size});
+  dst_dims.assign({seq_len, batch_size, iter_size * num_direction});
+  state_dims.assign({num_layer, num_direction, batch_size, iter_size});
+  cell_dims.assign({num_layer, num_direction, batch_size, state_size});
 
   // unidirectional size of a single cell
-  single_w_size = (input_size + state_size) * ngates * state_size;
+  single_w_size = (input_size + iter_size) * ngates * state_size;
+  if (proj_size > 0) single_w_size += state_size * proj_size;
   single_b_size = nbias * state_size;
   naive_single_b_size = ngates * state_size * 2;  // naive RNN variants have 
double bias
-  single_state_size = batch_size * state_size;
+  single_state_size = batch_size * iter_size;
 
   // Get workspace size for cached weights memory
   // multiplication of tensor dimensions
@@ -75,6 +79,7 @@ void MKLDNNRnnLayerParam::SetDims() {
 
   workspace_size = tz_volume(weight_layer_dims) + tz_volume(weight_iter_dims) +
       tz_volume(bias_dims);
+  if (proj_size > 0) workspace_size += tz_volume(weight_proj_dims);
   reserve_size = 0;
 }
 
@@ -82,7 +87,11 @@ MKLDNNRnnFullParam MKLDNNRnnFullParamParser(const RNNParam& 
rnn_param, const int
                                             const int batch_size, const int 
input_size) {
   MKLDNNRnnFullParam full_param;
   full_param.default_param = rnn_param;
-  size_t state_size = rnn_param.state_size;
+  const int state_size = rnn_param.state_size;
+  const int proj_size = rnn_param.projection_size.has_value() ?
+      rnn_param.projection_size.value() : -1;
+  const int iter_size = rnn_param.projection_size.has_value() ?
+      rnn_param.projection_size.value() : state_size;
   LayerParamVector &layer_params = full_param.layer_params;
 
   full_param.default_param.seq_length_ = seq_len;
@@ -90,20 +99,21 @@ MKLDNNRnnFullParam MKLDNNRnnFullParamParser(const RNNParam& 
rnn_param, const int
   full_param.default_param.input_size_ = input_size;
   // Set basic size by constructing MKLDNNRnnLayerParam instance(s)
   if (rnn_param.bidirectional) {  // unfused bidirectional multi-layer RNN
-    layer_params.emplace_back(1, batch_size, seq_len, input_size, state_size, 
rnn_param.mode);
+    layer_params.emplace_back(1, batch_size, seq_len, input_size, state_size, 
proj_size,
+        rnn_param.mode);
     for (size_t layer = 1; layer < rnn_param.num_layers; ++layer) {
-      layer_params.emplace_back(1, batch_size, seq_len, state_size * 2, 
state_size,
+      layer_params.emplace_back(1, batch_size, seq_len, iter_size * 2, 
state_size, proj_size,
           rnn_param.mode);
     }
-  } else if (input_size == static_cast<int>(state_size)) {  // fused 
multi-layer RNN
+  } else if (input_size == iter_size) {  // fused multi-layer
     layer_params.emplace_back(rnn_param.num_layers, batch_size, seq_len, 
input_size,
-        state_size, rnn_param.mode, false);
-  } else {  // unfused 1st layer, plus fused 2-end layers
-    layer_params.emplace_back(1, batch_size, seq_len, input_size, state_size, 
rnn_param.mode,
-        false);
+        state_size, proj_size, rnn_param.mode, false);
+  } else {  // unfused 1st layer, plus fused layers 2 to the end
+    layer_params.emplace_back(1, batch_size, seq_len, input_size, state_size, 
proj_size,
+        rnn_param.mode, false);
     if (rnn_param.num_layers > 1)
-      layer_params.emplace_back(rnn_param.num_layers - 1, batch_size, seq_len, 
state_size,
-          state_size, rnn_param.mode, false);
+      layer_params.emplace_back(rnn_param.num_layers - 1, batch_size, seq_len, 
iter_size,
+          state_size, proj_size, rnn_param.mode, false);
   }
 
   // Set dims, workspace size, and state_outputs flag
@@ -114,11 +124,13 @@ MKLDNNRnnFullParam MKLDNNRnnFullParamParser(const 
RNNParam& rnn_param, const int
   return full_param;
 }
 
-void MKLDNNRnnMemMgr::Init(dim_t size, const Context& ctx, int dtype) {
-  workspace_ = NDArray(TShape({size}), ctx, false, dtype);
+void MKLDNNRnnMemMgr::Init(dim_t size, const Context& ctx) {
+  workspace_ = NDArray(TShape({size}), ctx, false, mshadow::kUint8);
+  if (workspace_.data().dptr_ == nullptr)
+    LOG(FATAL) << "MKLDNN RNN operator memory allocation error.";
   curr_mem = static_cast<char *>(workspace_.data().dptr_);
-  mem_size = size * mshadow::mshadow_sizeof(dtype);
-  curr_size = size * mshadow::mshadow_sizeof(dtype);
+  mem_size = size;
+  curr_size = size;
 }
 
 mkldnn::memory *MKLDNNRnnMemMgr::Alloc(const mkldnn::memory::desc &md) {
@@ -162,16 +174,22 @@ RnnPrimitive GetRnnFwdPrim(
   auto bias_desc         = memory::desc(layer_param.bias_dims, data_type, 
tag::ldgo);
   auto dst_layer_desc    = memory::desc(layer_param.dst_dims, data_type, 
tag::tnc);
   auto src_state_desc    = memory::desc(layer_param.state_dims, data_type, 
tag::ldnc);
+  auto src_cell_desc     = memory::desc(layer_param.cell_dims, data_type, 
tag::ldnc);
+  auto weight_peep_desc  = memory::desc();
+  auto weight_proj_desc = layer_param.proj_size > 0 ? memory::desc(
+      layer_param.weight_proj_dims, weight_type, tag::any) : memory::desc();
   auto dst_state_desc = layer_param.state_outputs ? memory::desc(
       layer_param.state_dims, data_type, tag::ldnc) : memory::desc();
+  auto dst_cell_desc = layer_param.state_outputs ? memory::desc(
+      layer_param.cell_dims, data_type, tag::ldnc) : memory::desc();
 
   auto fwd = RnnPrimitive();
   switch (mode) {
     case rnn_enum::kLstm:
       fwd = RnnPrimitive::Create<lstm_forward>(prop, mkldnn_rnn_direction,
-          src_layer_desc, src_state_desc, src_state_desc, weight_layer_desc,
-          weight_iter_desc, bias_desc, dst_layer_desc, dst_state_desc,
-          dst_state_desc);
+          src_layer_desc, src_state_desc, src_cell_desc, weight_layer_desc,
+          weight_iter_desc, weight_peep_desc, weight_proj_desc, bias_desc,
+          dst_layer_desc, dst_state_desc, dst_cell_desc);
       break;
     case rnn_enum::kGru:
       fwd = RnnPrimitive::Create<lbr_gru_forward>(prop, mkldnn_rnn_direction,
@@ -287,7 +305,7 @@ static void ConcatWeights(const mkldnn::memory &dst,
   const memory::desc& dst_desc = dst.get_desc();
   // Use dst memory dims to initialize src memory dims, then set the concat
   // dim to 1. And Rnn weights are 5-dimension tensor.
-  memory::dims src_dims(dst_desc.data.dims, dst_desc.data.dims + 5);
+  memory::dims src_dims(dst_desc.data.dims, dst_desc.data.dims + 
dst_desc.data.ndims);
   src_dims.at(concat_dimension) = 1;
   std::vector<memory::desc> src_descs;
   std::unordered_map<int, memory> concat_args;
@@ -339,7 +357,6 @@ FUNC(MKLDNN_ARG_DIFF_##NAME, 
ARGS.at(MKLDNN_ARG_##NAME).get_desc(), HANDLE)
 void MKLDNNRnnForward::SetNewDataMem(void* x, void* hx, void* cx,
                                      void* y, void* hy, void* cy,
                                      const int dtype) {
-  using dims = mkldnn::memory::dims;
   using desc = mkldnn::memory::desc;
   using format_tag = mkldnn::memory::format_tag;
   auto& cpu_engine = CpuEngine::Get()->get_engine();
@@ -357,9 +374,9 @@ void MKLDNNRnnForward::SetNewDataMem(void* x, void* hx, 
void* cx,
   }
 
   if (param_.mode == rnn_enum::kLstm) {
-    RNN_FWD_SET(SRC_ITER_C, param_.state_dims, format_tag::ldnc, cx, dtype);
+    RNN_FWD_SET(SRC_ITER_C, param_.cell_dims, format_tag::ldnc, cx, dtype);
     if (param_.state_outputs) {
-      RNN_FWD_SET(DST_ITER_C, param_.state_dims, format_tag::ldnc, cy, dtype);
+      RNN_FWD_SET(DST_ITER_C, param_.cell_dims, format_tag::ldnc, cy, dtype);
     }
   }
 }
@@ -396,6 +413,7 @@ inline void MKLDNNMemoryReorder(const mkldnn::memory& src,
 void MKLDNNRnnForward::ReorderWeights() {
   MKLDNNMemoryReorder(*weights_layer_r_, *weights_layer_);
   MKLDNNMemoryReorder(*weights_iter_r_, *weights_iter_);
+  if (param_.proj_size > 0) MKLDNNMemoryReorder(*weights_proj_r_, 
*weights_proj_);
 }
 
 void AdjustGruGateOrder(char* weight,
@@ -470,66 +488,92 @@ inline void EmplaceNetArgs(mkldnn_args_map_t* net_args, 
const int arg_name,
  * memory with preferred format_tag. Finally, naive bias is fused to MKLDNN
  * bias memory.
  */
-void MKLDNNRnnForward::SetWeightsMem(MKLDNNRnnMemMgr* mgr, void *w_ptr, void 
*b_ptr,
+void MKLDNNRnnForward::SetWeightsMem(void *w_ptr, void *b_ptr,
                                      const bool is_train, const int dtype) {
   using format_tag = mkldnn::memory::format_tag;
   auto mkldnn_dtype = get_mkldnn_type(dtype);
+  const size_t dtype_bytes = mshadow::mshadow_sizeof(dtype);
+
+  const size_t buffer_bytes = this->GetSize()  // bytes of the weights memory
+      + (param_.workspace_size + param_.reserve_size) * dtype_bytes
+      + kMKLDNNAlign * 7;     // Add an alignment margin for the seven allocations
+                              // of dnnl memory handles, i.e. weights_layer_,
+                              // weights_iter_, weights_proj_, bias_,
+                              // weights_layer_r_, weights_iter_r_, and weights_proj_r_.
+  if (mem_mgr_.Size() < buffer_bytes) mem_mgr_.Init(buffer_bytes, this->ctx_);
+
+  const bool use_proj = (param_.proj_size > 0);
   // Get the weights' memory for RNN forward primitive
   if (weights_layer_ == nullptr) {
-    weights_layer_ = mgr->Alloc(fwd_inf_.GetLayerDesc());
+    weights_layer_ = mem_mgr_.Alloc(fwd_inf_.GetLayerDesc());
   }
   if (weights_iter_ == nullptr) {
-    weights_iter_ = mgr->Alloc(fwd_inf_.GetIterDesc());
+    weights_iter_ = mem_mgr_.Alloc(fwd_inf_.GetIterDesc());
+  }
+  if (use_proj && weights_proj_ == nullptr) {
+    weights_proj_ = mem_mgr_.Alloc(fwd_inf_.GetProjDesc());
   }
   if (bias_ == nullptr) {
-    bias_ = mgr->Alloc(
+    bias_ = mem_mgr_.Alloc(
         {param_.bias_dims, mkldnn_dtype, format_tag::ldgo});
   }
 
   // Get the intermediate memory for weights concat & reorder
   if (weights_layer_r_ == nullptr) {
-    weights_layer_r_ = mgr->Alloc(
+    weights_layer_r_ = mem_mgr_.Alloc(
         {param_.weight_layer_dims, mkldnn_dtype, format_tag::ldgoi});
   }
   if (weights_iter_r_ == nullptr) {
-    weights_iter_r_ = mgr->Alloc(
+    weights_iter_r_ = mem_mgr_.Alloc(
         {param_.weight_iter_dims, mkldnn_dtype, format_tag::ldgoi});
   }
-
-  // Get the bytes of a real type
-  size_t dtype_bytes = mshadow::mshadow_sizeof(dtype);
+  if (use_proj && weights_proj_r_ == nullptr) {
+    weights_proj_r_ = mem_mgr_.Alloc(
+        {param_.weight_proj_dims, mkldnn_dtype, format_tag::ldoi});
+  }
 
   // convert void* to char* for arithmetic operations
+  const size_t iter_size = use_proj ? param_.proj_size : param_.state_size;
   char *weights_ptr = static_cast<char *>(w_ptr);
   size_t wx_bytes = GetRnnGatesNum(param_.mode) * param_.state_size *
         param_.input_size * dtype_bytes;  //* DIMS: ngates x state_size x 
input_size
   size_t wh_bytes = GetRnnGatesNum(param_.mode) * param_.state_size *
-        param_.state_size * dtype_bytes;  //* DIMS: ngates x state_size x 
state_size
+        iter_size * dtype_bytes;  //* DIMS: ngates x state_size x state_size without
+                                  // projection; with it, ngates x state_size x projection_size
+  size_t wr_bytes = param_.state_size * iter_size * dtype_bytes;
   char *l2r_wx = weights_ptr;
   char *l2r_wh = l2r_wx + wx_bytes;       //* DIMS: ngates x state_size * 
state_size
+  char *l2r_wr = l2r_wh + wh_bytes;       //* DIMS: state_size x iter_size
 
   if (param_.num_layer == 1 && param_.bidirectional) {
     //* single bidirectinal layer, concat weights on direction axis
     char *r2l_wx = weights_ptr + param_.single_w_size * dtype_bytes;
-    char *r2l_wh = r2l_wx + wx_bytes;  //* DIMS: ngates x state_size * 
state_size
+    char *r2l_wh = r2l_wx + wx_bytes;  //* DIMS: ngates x state_size x 
state_size
+    char *r2l_wr = r2l_wh + wh_bytes;  //* DIMS: state_size x iter_size
     ConcatWeights(*weights_layer_r_, 1, {l2r_wx, r2l_wx}, format_tag::ldgoi);
     ConcatWeights(*weights_iter_r_, 1, {l2r_wh, r2l_wh}, format_tag::ldgoi);
+    if (use_proj) ConcatWeights(*weights_proj_r_, 1, {l2r_wr, r2l_wr}, 
format_tag::ldoi);
   } else if (param_.num_layer == 1 && !param_.bidirectional) {
     //* single uni-directional layer, no concatenate operator needed
     std::memcpy(weights_layer_r_->get_data_handle(), l2r_wx, wx_bytes);
     std::memcpy(weights_iter_r_->get_data_handle(), l2r_wh, wh_bytes);
+    if (use_proj) std::memcpy(weights_proj_r_->get_data_handle(), l2r_wr, 
wr_bytes);
   } else if (param_.num_layer > 1 && !param_.bidirectional) {
     //* concat fused multi-layer weights on layer axis
     std::vector<void *> l2r_wx_ptrs;
     std::vector<void *> l2r_wh_ptrs;
+    std::vector<void *> l2r_wr_ptrs;
     for (int lyr = 0; lyr < param_.num_layer; ++lyr) {
       char *lth_wx = l2r_wx + lyr * param_.single_w_size * dtype_bytes;
       char *lth_wh = lth_wx + wx_bytes;
+      char *lth_wr = lth_wh + wh_bytes;
       l2r_wx_ptrs.push_back(lth_wx);
       l2r_wh_ptrs.push_back(lth_wh);
+      if (use_proj) l2r_wr_ptrs.push_back(lth_wr);
     }
     ConcatWeights(*weights_layer_r_, 0, l2r_wx_ptrs, format_tag::ldgoi);
     ConcatWeights(*weights_iter_r_, 0, l2r_wh_ptrs, format_tag::ldgoi);
+    if (use_proj) ConcatWeights(*weights_proj_r_, 0, l2r_wr_ptrs, 
format_tag::ldoi);
   } else {
     LOG(FATAL) << "Undifined RNN fusion workflow for num_layer = " << 
param_.num_layer
                << ", and bidirectional is " << param_.bidirectional;
@@ -566,6 +610,7 @@ void MKLDNNRnnForward::SetWeightsMem(MKLDNNRnnMemMgr* mgr, 
void *w_ptr, void *b_
   EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_WEIGHTS_LAYER, 
this->weights_layer_);
   EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_WEIGHTS_ITER,  
this->weights_iter_);
   EmplaceNetArgs(&this->net_args_, MKLDNN_ARG_BIAS,          this->bias_);
+  if (use_proj) EmplaceNetArgs(&this->net_args_, DNNL_ARG_WEIGHTS_PROJECTION, 
this->weights_proj_);
 
   if (!is_train) {
     // Reorder after adjustment only when is_train == false. When is_train == 
true, i.e.
@@ -628,31 +673,22 @@ void MKLDNNRnnForwardTraining::FetchData(const 
MKLDNNRnnForward& fwd) {
   }
 }
 
-void MKLDNNRnnOp::Init(const OpContext &ctx,
+void MKLDNNRnnOp::Init(const OpContext &op_ctx,
                        const std::vector<NDArray> &inputs,
                        const std::vector<OpReqType> &req,
                        const std::vector<NDArray> &outputs) {
-  using memory = mkldnn::memory;
   using format_tag = mkldnn::memory::format_tag;
 
   // In the `autograd.record()` context, RNNOp is required to run into
   // `forward_training` mode.
-  const bool is_training = (ctx.is_train || ctx.need_grad);
+  const bool is_training = (op_ctx.is_train || op_ctx.need_grad);
   const size_t num_fusion = full_param_.layer_params.size();
+  const Context& ctx = op_ctx.run_ctx.ctx;
   if (fwd_inf_vec_.size() < num_fusion) {
-    size_t buffer_size = 0;  // Element number, instead of bytes, in the buffer
-    for (auto& layer_param : full_param_.layer_params) {
-      buffer_size += layer_param.workspace_size + layer_param.reserve_size;
-    }
-    buffer_size += outputs[rnn_enum::kOut].data().Size() * (num_fusion - 1);
-    buffer_size += kMKLDNNAlign * num_fusion * 5;  // Add margin for alignment
-
     for (auto& layer_param : full_param_.layer_params) {
-      fwd_inf_vec_.emplace_back(layer_param,
-          ctx.is_train, inputs[rnn_enum::kData], inputs[rnn_enum::kParams]);
-      buffer_size += 
fwd_inf_vec_.back().GetSize(inputs[rnn_enum::kParams].dtype());
+      fwd_inf_vec_.emplace_back(ctx, layer_param, false, 
inputs[rnn_enum::kData],
+          inputs[rnn_enum::kParams]);
     }
-    mgr_.Init(buffer_size, ctx.run_ctx.ctx, inputs[rnn_enum::kParams].dtype());
   }
 
   if (is_training && fwd_trn_vec_.size() < num_fusion) {
@@ -680,7 +716,7 @@ void MKLDNNRnnOp::Init(const OpContext &ctx,
     size_t layer_bias_bytes = single_b_bytes * directions;  // Naive MXNet has 
double bias
 
     if (!fwd_layer.IsInitialized() || is_training)
-      fwd_layer.SetWeightsMem(&(this->mgr_), weights_ptr, bias_ptr, 
is_training, dtype);
+      fwd_layer.SetWeightsMem(weights_ptr, bias_ptr, is_training, dtype);
     weights_ptr += layer_weights_bytes;
     bias_ptr += layer_bias_bytes;
   }
@@ -696,6 +732,10 @@ void MKLDNNRnnOp::Init(const OpContext &ctx,
       "Layer vector's size has a different value than the number of fusion.";
   if (dst_.size() < num_fusion - 1) {
     int data_dtype = outputs[rnn_enum::kOut].dtype();
+    const size_t data_dbytes = mshadow::mshadow_sizeof(data_dtype);
+    mgr_.Init(
+        (outputs[rnn_enum::kOut].data().Size() * data_dbytes + kMKLDNNAlign) * 
(num_fusion - 1),
+        op_ctx.run_ctx.ctx);
     // Here we need `fwd_inf_vec_.size() - 1` spaces for the intermediate 
results of the multiple
     // fused layers. And for the result of the last fused layer, 
`outputs[rnn_enum::kOut]` could
     // provide the space. Hence, `forward_inf_vec_.back()` is excluded when 
allocates the spaces
@@ -960,6 +1000,8 @@ void MKLDNNRnnOp::Forward(const OpContext &ctx,
   // forward_training mode.
   const bool is_training = (ctx.is_train || ctx.need_grad);
   const RNNParam& default_param = full_param_.default_param;
+  if (is_training && default_param.projection_size.has_value())
+    LOG(FATAL) << "Backward/Training mode is not implemented!";
 
   // Initialize weights version
   if (!initialized_ && weights_version_ == 0) {
@@ -974,7 +1016,13 @@ void MKLDNNRnnOp::Forward(const OpContext &ctx,
     weights_version_ = inputs[rnn_enum::kParams].version();
   }
 
-  if (!initialized_ || is_training || fwd_inf_vec_.size() == 0) {
+  if (dmlc::GetEnv("MXNET_RNN_USE_WEIGHT_CACHE", 0) && !initialized_) {
+    LOG(INFO) << "The current weight of RNN is assumed to be fixed and cached 
during "
+        "the whole inference pipeline. Please set 
MXNET_RNN_USE_WEIGHT_CACHE=0, if "
+        "the weight changed at runtime.";
+  }
+  if ((!dmlc::GetEnv("MXNET_RNN_USE_WEIGHT_CACHE", 0) && !initialized_) ||
+      is_training || fwd_inf_vec_.size() == 0) {
     Init(ctx, inputs, req, outputs);
   }
 
@@ -985,10 +1033,14 @@ void MKLDNNRnnOp::Forward(const OpContext &ctx,
   const int seq_length = default_param.seq_length_;
   const int batch_size = default_param.batch_size_;
   const int state_size = default_param.state_size;
+  const int iter_size  = default_param.projection_size.has_value() ?
+      default_param.projection_size.value() : default_param.state_size;
   const int directions = default_param.bidirectional ? 2 : 1;
-  mkldnn::memory::desc dst_desc({seq_length, batch_size, directions * 
state_size},
+  mkldnn::memory::desc dst_desc({seq_length, batch_size, directions * 
iter_size},
       get_mkldnn_type(data_dtype), mkldnn::memory::format_tag::tnc);
-  mkldnn::memory::desc state_desc({num_layers, directions, batch_size, 
state_size},
+  mkldnn::memory::desc state_desc({num_layers, directions, batch_size, 
iter_size},
+      get_mkldnn_type(data_dtype), mkldnn::memory::format_tag::ldnc);
+  mkldnn::memory::desc cell_desc({num_layers, directions, batch_size, 
state_size},
       get_mkldnn_type(data_dtype), mkldnn::memory::format_tag::ldnc);
   auto out_mem = CreateMKLDNNMem(outputs[rnn_enum::kOut], dst_desc, 
req[rnn_enum::kOut]);
   mkldnn_output_t stateout_mem;
@@ -1012,7 +1064,7 @@ void MKLDNNRnnOp::Forward(const OpContext &ctx,
     src_state_cell = static_cast<char 
*>(inputs[rnn_enum::kStateCell].data().dptr_);
     if (default_param.state_outputs && req[rnn_enum::kStateCellOut] != 
kNullOp) {
       statecellout_mem = CreateMKLDNNMem(
-          outputs[rnn_enum::kStateCellOut], state_desc, 
req[rnn_enum::kStateCellOut]);
+          outputs[rnn_enum::kStateCellOut], cell_desc, 
req[rnn_enum::kStateCellOut]);
       dst_state_cell = static_cast<char 
*>(statecellout_mem.second->get_data_handle());
     }
   }
@@ -1025,8 +1077,10 @@ void MKLDNNRnnOp::Forward(const OpContext &ctx,
     }
   } else {
     CHECK_EQ(fwd_inf_vec_.size(), dst_.size() + 1) << "Output memory error.";
+    size_t state_bytes = (default_param.bidirectional + 1) * 
default_param.batch_size_ *
+        iter_size * mshadow::mshadow_sizeof(data_dtype);
     size_t cell_bytes = (default_param.bidirectional + 1) * 
default_param.batch_size_ *
-        default_param.state_size * mshadow::mshadow_sizeof(data_dtype);
+        state_size * mshadow::mshadow_sizeof(data_dtype);
 
     // Set input data memory for the first layer. This stores intermediate 
output
     // results in this->xxx, used as the source input of the next layer.
@@ -1037,9 +1091,9 @@ void MKLDNNRnnOp::Forward(const OpContext &ctx,
     }
     // 1st_lyr -> dst_handle -> next_lyr -> dst_handle -> next_lyr -> ...
     for (size_t lyr = 1; lyr < fwd_inf_vec_.size() - 1; ++lyr) {
-      src_state += cell_bytes;
+      src_state += state_bytes;
+      if (dst_state) dst_state += state_bytes;
       if (src_state_cell) src_state_cell += cell_bytes;
-      if (dst_state) dst_state += cell_bytes;
       if (dst_state_cell) dst_state_cell += cell_bytes;
       fwd_inf_vec_.at(lyr).SetNewDataMem(this->dst_.at(lyr - 
1)->get_data_handle(),
           src_state, src_state_cell,
@@ -1049,9 +1103,9 @@ void MKLDNNRnnOp::Forward(const OpContext &ctx,
       }
     }
     // Set output data memory for the last layer.
-    src_state += cell_bytes;
+    src_state += state_bytes;
+    if (dst_state) dst_state += state_bytes;
     if (src_state_cell) src_state_cell += cell_bytes;
-    if (dst_state) dst_state += cell_bytes;
     if (dst_state_cell) dst_state_cell += cell_bytes;
     fwd_inf_vec_.back().SetNewDataMem(this->dst_.back()->get_data_handle(),
         src_state, src_state_cell, dst, dst_state, dst_state_cell, data_dtype);
@@ -1148,7 +1202,7 @@ void MKLDNNRnnOp::Backward(const OpContext& ctx,
     bwd_vec_.back().SetDataGradsMem(dx, dhx, dcx, dy, dhy, dcy, data_dtype);
     RegisterMKLDNNRnn(bwd_vec_.back());
   } else {
-    const size_t cell_bytes = (default_param.bidirectional + 1) * 
default_param.batch_size_ *
+    const size_t state_bytes = (default_param.bidirectional + 1) * 
default_param.batch_size_ *
         default_param.state_size * mshadow::mshadow_sizeof(data_dtype);
     if (diff_src == nullptr) {
       auto desc = 
mkldnn::memory::desc(full_param_.layer_params.back().src_dims,
@@ -1159,17 +1213,17 @@ void MKLDNNRnnOp::Backward(const OpContext& ctx,
     bwd_vec_.front().SetDataGradsMem(dx, dhx, dcx,
         diff_src->get_data_handle(), dhy, dcy, data_dtype);
     for (size_t lyr = 1; lyr < bwd_vec_.size() - 1; ++lyr) {
-      if (dhx) dhx += cell_bytes;
-      if (dcx) dcx += cell_bytes;
-      if (dhy) dhy += cell_bytes;
-      if (dcy) dcy += cell_bytes;
+      if (dhx) dhx += state_bytes;
+      if (dcx) dcx += state_bytes;
+      if (dhy) dhy += state_bytes;
+      if (dcy) dcy += state_bytes;
       bwd_vec_.at(lyr).SetDataGradsMem(diff_src->get_data_handle(), dhx, dcx,
           diff_src->get_data_handle(), dhy, dcy, data_dtype);
     }
-    if (dhx) dhx += cell_bytes;
-    if (dcx) dcx += cell_bytes;
-    if (dhy) dhy += cell_bytes;
-    if (dcy) dcy += cell_bytes;
+    if (dhx) dhx += state_bytes;
+    if (dcx) dcx += state_bytes;
+    if (dhy) dhy += state_bytes;
+    if (dcy) dcy += state_bytes;
     bwd_vec_.back().SetDataGradsMem(diff_src->get_data_handle(), dhx, dcx,
         dy, dhy, dcy, data_dtype);
 
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index db23603..4068007 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -185,6 +185,7 @@ inline int GetRnnBiasSize(int num_layer,
 inline size_t GetRNNWorkspaceSize(int seq_length,
                                   int batch_size,
                                   int hidden_size,
+                                  int projection_size,
                                   int direction,
                                   int mode) {
   size_t size = 0;
@@ -324,6 +325,7 @@ void RNNForwardInference(DType* ws,
                          const int batch_size,
                          const int input_size,
                          const int state_size,
+                         const int projection_size,
                          DType* x_ptr,
                          DType* hx_ptr,
                          DType* cx_ptr,
@@ -336,8 +338,8 @@ void RNNForwardInference(DType* ws,
   switch (mode) {
     case rnn_enum::kLstm:
       LstmForwardInference<DType>(ws, state_outputs, num_layers, direction, 
seq_length,
-                                  batch_size, input_size, state_size, x_ptr, 
hx_ptr, cx_ptr,
-                                  w_ptr, b_ptr, y_ptr, hy_ptr, cy_ptr);
+                                  batch_size, input_size, state_size, 
projection_size,
+                                  x_ptr, hx_ptr, cx_ptr, w_ptr, b_ptr, y_ptr, 
hy_ptr, cy_ptr);
       break;
     case rnn_enum::kGru:
       GruForwardInference<DType>(ws, state_outputs, num_layers, direction, 
seq_length,
@@ -511,10 +513,7 @@ class RNNOp {
       this->temp_init_space_ = false;
       this->reserve_cpu_space_size_ = 0;
       this->temp_cpu_space_size_ = 0;
-      if (param_.projection_size.has_value()) {
-        LOG(FATAL) <<
-            "hidden layer projection is only supported for GPU with CuDNN 
later than 7.1.1";
-      }
+
       if (param_.lstm_state_clip_min.has_value()
           || param_.lstm_state_clip_max.has_value()) {
         LOG(FATAL) << "LSTM state clipping is only supported for GPU with 
CuDNN later than 7.2.1";
@@ -843,9 +842,14 @@ class RNNOp {
 #endif  // MXNET_USE_CUDNN == 1 && defined(__CUDACC__)
 
     if (ctx_.dev_type == kCPU) {
+      int projection_size = 0;
+      if (param_.projection_size.has_value()) {
+        projection_size = param_.projection_size.value();
+      }
+
       // allocate temp space
       const size_t work_cpu_space_size = 
GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_,
-          param_.state_size, direction, param_.mode);
+          param_.state_size, projection_size, direction, param_.mode);
       if (!temp_init_space_ || temp_cpu_space_size_ < work_cpu_space_size) {
         temp_cpu_space_size_ = work_cpu_space_size;
         temp_cpu_space_ = 
NDArray(TShape({static_cast<dim_t>(temp_cpu_space_size_)}), ctx_,
@@ -856,6 +860,9 @@ class RNNOp {
 
       if (ctx.is_train || ctx.need_grad) {
         // allocate reserve space
+        if (param_.projection_size.has_value()) {
+          LOG(FATAL) << "No training support for LSTM with projection on CPU 
currently.";
+        }
 
         const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, 
direction,
                                                      param_.seq_length_, 
param_.batch_size_,
@@ -896,6 +903,7 @@ class RNNOp {
                                    param_.batch_size_,
                                    param_.input_size_,
                                    param_.state_size,
+                                   projection_size,
                                    x.dptr_,
                                    hx.dptr_,
                                    cx_ptr,
@@ -1096,10 +1104,17 @@ class RNNOp {
 #endif  // MXNET_USE_CUDNN == 1 && defined(__CUDACC__)
 
     if (ctx_.dev_type == kCPU) {
+      int projection_size = 0;
+      if (param_.projection_size.has_value()) {
+        // TODO(zixuanweeei): Add training support for LSTM with projection on 
CPU.
+        // projection_size = param_.projection_size.value();
+        LOG(FATAL) << "No training support for LSTM with projection on CPU 
currently.";
+      }
+
       // allocate temp space
       const size_t work_cpu_space_size =
-          GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_,
-                              param_.state_size, direction, param_.mode);
+          GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, 
param_.state_size,
+                              projection_size, direction, param_.mode);
       if (!temp_init_space_ || temp_cpu_space_size_ != work_cpu_space_size) {
         LOG(FATAL) << "Check temp init error";
       }
diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc
index a8e1b12..97fd754 100644
--- a/src/operator/rnn.cc
+++ b/src/operator/rnn.cc
@@ -190,20 +190,17 @@ static std::vector<ResourceRequest> RNNResourceEx(const 
NodeAttrs& attrs, const
   return request;
 }
 
+#if MXNET_USE_MKLDNN == 1
 inline static bool RNNStorageType(const nnvm::NodeAttrs& attrs,
                                   const int dev_mask,
                                   DispatchMode* dispatch_mode,
                                   std::vector<int> *in_attrs,
                                   std::vector<int> *out_attrs) {
-  DispatchMode wanted_mode = DispatchMode::kFCompute;
-
-#if MXNET_USE_MKLDNN == 1
-  wanted_mode = DispatchMode::kFComputeEx;
-#endif  // MXNET_USE_MKLDNN == 1
-
-  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
-                             dispatch_mode, wanted_mode);
+  const bool support_mkldnn_rnn = dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1);
+  return MKLDNNStorageType(attrs, dev_mask, support_mkldnn_rnn,
+                           dispatch_mode, in_attrs, out_attrs);
 }
+#endif  // MXNET_USE_MKLDNN == 1
 
 struct RNNGrad {
   const char *op_name;
@@ -246,9 +243,7 @@ static OpStatePtr CreateRNNState(const nnvm::NodeAttrs 
&attrs,
   }
 
 #if MXNET_USE_MKLDNN == 1
-  if ((in_types[0] == mshadow::kFloat32 || in_types[0] == mshadow::kFloat16)
-      && in_shapes[0].ndim() == 3 && ctx.dev_type == kCPU
-      && dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1)) {
+  if (ctx.dev_type == kCPU && SupportMKLDNNRnn(in_types[rnn_enum::kData])) {
     const mxnet::TShape& data_shape = in_shapes[rnn_enum::kData];
     state = OpStatePtr::Create<MKLDNNRnnOp>(param, data_shape[0],
         data_shape[1], data_shape[2]);
@@ -274,7 +269,7 @@ static void RNNStatefulComputeExCPU(const OpStatePtr& 
state_ptr,
                                     const std::vector<NDArray>& inputs,
                                     const std::vector<OpReqType>& req,
                                     const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNRnn(inputs[0])) {
+  if (SupportMKLDNNRnn(inputs[rnn_enum::kData].dtype())) {
     MKLDNNRnnOp& op = state_ptr.get_state<MKLDNNRnnOp>();
     op.Forward(ctx, inputs, req, outputs);
   } else {
@@ -287,7 +282,7 @@ static void RNNStatefulGradComputeExCPU(const OpStatePtr& state_ptr,
                                         const std::vector<NDArray>& inputs,
                                         const std::vector<OpReqType>& req,
                                         const std::vector<NDArray>& outputs) {
-  if (SupportMKLDNNRnn(inputs[0])) {
+  if (SupportMKLDNNRnn(inputs[rnn_enum::kData].dtype())) {
     MKLDNNRnnOp& op = state_ptr.get_state<MKLDNNRnnOp>();
     op.Backward(ctx, inputs, req, outputs);
   } else {
@@ -338,6 +333,23 @@ Long Short-Term Memory - Hochreiter, 1997. http://www.bioinf.jku.at/publications
             h_t = o_t * \tanh(c_t)
             \end{array}
 
+When the projection size is set, LSTM uses the projection feature to reduce the number of
+parameters and gain some speedup without significantly hurting accuracy.
+
+Long Short-Term Memory Based Recurrent Neural Network Architectures for Large Vocabulary Speech
+Recognition - Sak et al. 2014. https://arxiv.org/abs/1402.1128
+
+.. math::
+  \begin{array}{ll}
+            i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{ri} r_{(t-1)} + b_{ri}) \\
+            f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{rf} r_{(t-1)} + b_{rf}) \\
+            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{rg} r_{(t-1)} + b_{rg}) \\
+            o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ro} r_{(t-1)} + b_{ro}) \\
+            c_t = f_t * c_{(t-1)} + i_t * g_t \\
+            h_t = o_t * \tanh(c_t) \\
+            r_t = W_{hr} h_t
+            \end{array}
+
 **GRU**
 
 Gated Recurrent Unit - Cho et al. 2014. http://arxiv.org/abs/1406.1078
@@ -385,10 +397,10 @@ The definition of GRU here is slightly different from paper but compatible with
 })
 .set_attr<mxnet::FInferShape>("FInferShape", RNNShape)
 .set_attr<nnvm::FInferType>("FInferType", RNNType)
-.set_attr<FInferStorageType>("FInferStorageType", RNNStorageType)
 .set_attr<FCreateOpState>("FCreateOpState", CreateRNNState)
 .set_attr<FStatefulCompute>("FStatefulCompute<cpu>", RNNStatefulCompute<cpu>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<FInferStorageType>("FInferStorageType", RNNStorageType)
 .set_attr<bool>("TIsMKLDNN", true)
+.set_attr<FStatefulComputeEx>("FStatefulComputeEx<cpu>", RNNStatefulComputeExCPU)
 #endif
@@ -413,9 +425,9 @@ NNVM_REGISTER_OP(_backward_RNN)
 .set_attr_parser(ParamParser<RNNParam>)
 .set_attr<bool>("TIsLayerOpBackward", true)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FInferStorageType>("FInferStorageType", RNNStorageType)
 .set_attr<FStatefulCompute>("FStatefulCompute<cpu>", RNNStatefulGradCompute<cpu>)
 #if MXNET_USE_MKLDNN == 1
+.set_attr<FInferStorageType>("FInferStorageType", RNNStorageType)
 .set_attr<bool>("TIsMKLDNN", true)
+.set_attr<FStatefulComputeEx>("FStatefulComputeEx<cpu>", RNNStatefulGradComputeExCPU)
 #endif
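
With this change the RNN storage-type function is registered only for MKL-DNN builds and honours the MXNET_USE_MKLDNN_RNN environment variable, so the fused DNNL RNN path can be switched off at runtime instead of rebuilding. A small sketch of the toggle (the variable name is taken from the diff; everything else is illustrative):

    import os

    # Set before MXNet dispatches the first RNN op; safest before importing mxnet.
    os.environ['MXNET_USE_MKLDNN_RNN'] = '0'   # fall back to the native CPU implementation

    import mxnet as mx
    from mxnet import gluon

    lstm = gluon.rnn.LSTM(hidden_size=512, projection_size=256, layout='TNC')
    lstm.initialize(ctx=mx.cpu())
    out = lstm(mx.nd.ones((3, 5, 128)))
    print(out.shape)   # (3, 5, 256) whichever backend handled the computation
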
diff --git a/src/operator/rnn_impl.h b/src/operator/rnn_impl.h
index e1b4a2b..22b7130 100644
--- a/src/operator/rnn_impl.h
+++ b/src/operator/rnn_impl.h
@@ -209,6 +209,7 @@ void LstmForwardInferenceSingleLayer(DType* ws,
                                      const int N,
                                      const int I,
                                      const int H,
+                                     const int P,
                                      const Tensor<cpu, 2, DType> &x,
                                      const Tensor<cpu, 2, DType> &hx,
                                      const Tensor<cpu, 2, DType> &cx,
@@ -219,7 +220,9 @@ void LstmForwardInferenceSingleLayer(DType* ws,
                                      DType* cy_ptr) {
   using namespace mshadow;
   const Tensor<cpu, 2, DType> wx(w_ptr, Shape2(H * 4, I));
-  const Tensor<cpu, 2, DType> wh(w_ptr + I * H * 4, Shape2(H * 4, H));
+  const Tensor<cpu, 2, DType> wh(w_ptr + I * H * 4, Shape2(H * 4, (P ? P : H)));
+  Tensor<cpu, 2, DType> whr(w_ptr, Shape2(1, 1));
+  if (P > 0) whr = Tensor<cpu, 2, DType>(wh.dptr_ + P * 4 * H, Shape2(P, H));
   const Tensor<cpu, 2, DType> bx(b_ptr, Shape2(4, H));
   const Tensor<cpu, 2, DType> bh(b_ptr + H * 4, Shape2(4, H));
   Tensor<cpu, 2, DType> yx_flat(ws, Shape2(T * N, H * 4));
@@ -228,7 +231,10 @@ void LstmForwardInferenceSingleLayer(DType* ws,
   const Tensor<cpu, 3, DType> yh(yh_flat.dptr_, Shape3(N, 4, H));
   Tensor<cpu, 2, DType> h(yh_flat.dptr_ + N * H * 4, Shape2(N, H));
   Tensor<cpu, 2, DType> c(h.dptr_ + N * H, Shape2(N, H));
+  Tensor<cpu, 2, DType> r(hy_ptr, Shape2(1, 1));
+  if (P > 0) r = Tensor<cpu, 2, DType>(hy_ptr, Shape2(N, P));
   const int offset = bid ? H : 0;
+  const int proj_offset = bid ? P : 0;
   const DType alpha = 1.0;
   const DType beta = 0.0;
   const int cell_size = N * H;
@@ -237,7 +243,11 @@ void LstmForwardInferenceSingleLayer(DType* ws,
   const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
   for (int i = 0; i < T; ++i) {
     int t = bid ? T - 1 - i : i;
-    linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true);
+    if (P > 0) {
+      linalg_gemm(i ? r : hx, wh, yh_flat, alpha, beta, false, true);
+    } else {
+      linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true);
+    }
     #pragma omp parallel for num_threads(omp_threads)
     for (int jk = 0; jk < cell_size; ++jk) {
       int j = jk / H;
@@ -248,14 +258,26 @@ void LstmForwardInferenceSingleLayer(DType* ws,
       DType ot = sigmoid<DType>(yx[t][j][3][k] + yh[j][3][k] + bx[3][k] + bh[3][k]);
       DType ct = (i ? c[j][k] : cx[j][k]) * ft + it * gt;
       DType ht = ot * tanh(ct);
-      y[t][j][k + offset] = ht;
+      if (P == 0) y[t][j][k + offset] = ht;
       if (i == T - 1 && state_outputs) {
-        hy_ptr[jk] = ht;
+        if (P == 0) hy_ptr[jk] = ht;
         cy_ptr[jk] = ct;
       } else {
-        h[j][k] = ht;
         c[j][k] = ct;
       }
+      h[j][k] = ht;
+    }
+    if (P > 0) {
+      linalg_gemm(h, whr, r, alpha, beta, false, true);
+#pragma GCC diagnostic push
+#if __GNUC__ >= 8
+#pragma GCC diagnostic ignored "-Wclass-memaccess"
+#endif
+#pragma omp parallel for num_threads(omp_threads)
+      for (int j = 0; j < N; ++j) {
+        std::memcpy(y[t][j].dptr_ + proj_offset, r[j].dptr_, P * sizeof(DType));
+      }
+#pragma GCC diagnostic pop
     }
   }
 }
@@ -269,6 +291,7 @@ void LstmForwardInference(DType* ws,
                           const int N,
                           const int I,
                           const int H,
+                          const int P,
                           DType* x_ptr,
                           DType* hx_ptr,
                           DType* cx_ptr,
@@ -278,25 +301,29 @@ void LstmForwardInference(DType* ws,
                           DType* hy_ptr,
                           DType* cy_ptr) {
   const int total_layers = D * L;
-  Tensor<cpu, 3, DType> hx(hx_ptr, Shape3(total_layers, N, H));
+  Tensor<cpu, 3, DType> hx(hx_ptr, Shape3(total_layers, N, P ? P : H));
   Tensor<cpu, 3, DType> cx(cx_ptr, Shape3(total_layers, N, H));
   const int b_size = 2 * H * 4;
   const int cell_size = N * H;
+  const int projection_size = (P ? P : H) * N;
   DType* y_tmp_ptr = ws + (T + 1) * cell_size * 4 + cell_size * 2;
   DType* y_cur_ptr = y_ptr;
   int idx = 0;  // state & cell state's idx;
   bool flag = L % 2 ? false : true;
   for (int i = 0; i < L; ++i) {
-    const int input_size = i ? H * D : I;
-    const int w_size = (input_size + H) * H * 4;
+    const int input_size = i ? (P ? P : H) * D : I;
+    int w_size = (input_size + (P ? P : H)) * H * 4;
+    if (P > 0) {
+      w_size += P * H;
+    }
     // If bidirectional, need space to save current layer output y.
     if (D == 2) {
       y_cur_ptr = flag ? y_tmp_ptr : y_ptr;
       flag = !flag;
     }
     Tensor<cpu, 2, DType> x(x_ptr, Shape2(T * N, input_size));
-    Tensor<cpu, 3, DType> y(y_cur_ptr, Shape3(T, N, H * D));
-    LstmForwardInferenceSingleLayer<DType>(ws, state_outputs, false, T, N, input_size, H,
+    Tensor<cpu, 3, DType> y(y_cur_ptr, Shape3(T, N, (P ? P : H) * D));
+    LstmForwardInferenceSingleLayer<DType>(ws, state_outputs, false, T, N, input_size, H, P,
                                            x, hx[idx], cx[idx], y, w_ptr, b_ptr, hy_ptr, cy_ptr);
     // If bidirectional, then calculate the reverse direction's forward result.
     if (D == 2) {
@@ -304,10 +331,10 @@ void LstmForwardInference(DType* ws,
       b_ptr += b_size;
       ++idx;
       if (state_outputs) {
-        hy_ptr += cell_size;
+        hy_ptr += projection_size;
         cy_ptr += cell_size;
       }
-      LstmForwardInferenceSingleLayer<DType>(ws, state_outputs, true, T, N, input_size, H,
+      LstmForwardInferenceSingleLayer<DType>(ws, state_outputs, true, T, N, input_size, H, P,
                                              x, hx[idx], cx[idx], y, w_ptr, b_ptr, hy_ptr, cy_ptr);
     }
     // Don't need to move pointer in the last layer.
@@ -317,7 +344,7 @@ void LstmForwardInference(DType* ws,
       x_ptr = y_cur_ptr;
       ++idx;
       if (state_outputs) {
-        hy_ptr += cell_size;
+        hy_ptr += projection_size;
         cy_ptr += cell_size;
       }
     }
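
The projection branch above computes the full hidden state h_t as before, then maps it to the smaller recurrent state r_t = W_hr * h_t and writes r_t, rather than h_t, into the layer output. A NumPy sketch of a single such time step, with made-up dimensions, purely to mirror the math:

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    N, I, H, P = 2, 8, 16, 4                    # batch, input, hidden, projection (illustrative)
    rng = np.random.default_rng(0)

    x   = rng.standard_normal((N, I))
    r   = np.zeros((N, P))                      # projected recurrent state r_{t-1}
    c   = np.zeros((N, H))                      # cell state c_{t-1}
    wx  = rng.standard_normal((4 * H, I))       # input weights, gate order i, f, g, o
    wh  = rng.standard_normal((4 * H, P))       # recurrent weights act on r, not on h
    whr = rng.standard_normal((P, H))           # projection weights W_hr
    bx  = rng.standard_normal(4 * H)
    bh  = rng.standard_normal(4 * H)

    gates = x @ wx.T + r @ wh.T + bx + bh
    i_t, f_t, g_t, o_t = np.split(gates, 4, axis=1)
    c = sigmoid(f_t) * c + sigmoid(i_t) * np.tanh(g_t)
    h = sigmoid(o_t) * np.tanh(c)               # full hidden state, size H
    r = h @ whr.T                               # projected state, size P, written to the output
    print(h.shape, r.shape)                     # (2, 16) (2, 4)
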
diff --git a/tests/cpp/operator/mkldnn_test.cc b/tests/cpp/operator/mkldnn_test.cc
index bcdb38a..73b9d93 100644
--- a/tests/cpp/operator/mkldnn_test.cc
+++ b/tests/cpp/operator/mkldnn_test.cc
@@ -100,7 +100,7 @@ static void VerifyDefMem(const mkldnn::memory &mem) {
 
 TEST(MKLDNN_UTIL_FUNC, MemFormat) {
   // Check whether the number of format is correct.
-  CHECK_EQ(mkldnn_format_tag_last, 131);
+  CHECK_EQ(mkldnn_format_tag_last, 168);
   CHECK_EQ(mkldnn_nchw, 5);
   CHECK_EQ(mkldnn_oihw, 5);
 }
diff --git a/tests/nightly/JenkinsfileForBinaries b/tests/nightly/JenkinsfileForBinaries
index 2b55c05..eeb2b3c 100755
--- a/tests/nightly/JenkinsfileForBinaries
+++ b/tests/nightly/JenkinsfileForBinaries
@@ -20,7 +20,7 @@
 
 mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/tvm/libtvm_runtime.so, build/libtvmop.so, build/tvmop.conf, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
-mx_lib_cpp_example_mkl = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, build/cpp-package/example/imagenet_inference, lib/libmkldnn.so.1'
+mx_lib_cpp_example_mkl = 'lib/libmxnet.so, lib/libmxnet.a, lib/libtvm_runtime.so, lib/libtvmop.so, lib/tvmop.conf, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, build/cpp-package/example/imagenet_inference, lib/libdnnl.so.1'
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
diff --git a/tests/python/mkl/test_mkldnn_install.py b/tests/python/mkl/test_mkldnn_install.py
index c2f26df..cc04981 100644
--- a/tests/python/mkl/test_mkldnn_install.py
+++ b/tests/python/mkl/test_mkldnn_install.py
@@ -46,7 +46,7 @@ def test_mkldnn_install():
 
     pid = os.getpid()
     rc = os.system("cat /proc/" + str(pid) +
-                   "/maps | grep libmkldnn > /dev/null")
+                   "/maps | grep libdnnl > /dev/null")
 
     if rc == 0:
         logging.info("MXNet is built/installed correctly with MKL-DNN")
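
The runtime check now greps for libdnnl, matching the library name shipped with DNNL v1.x. An alternative way to verify the build at runtime, sketched with the mxnet.runtime feature API available in 1.6 (assuming an MKL-DNN-enabled libmxnet):

    from mxnet.runtime import Features

    # Reports whether this libmxnet binary was built with MKL-DNN/DNNL support.
    print(Features().is_enabled('MKLDNN'))
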
diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py
index 0f27f53..790b1ec 100644
--- a/tests/python/unittest/test_gluon_rnn.py
+++ b/tests/python/unittest/test_gluon_rnn.py
@@ -26,6 +26,25 @@ import unittest
 from mxnet.test_utils import almost_equal, assert_almost_equal
 from common import assert_raises_cudnn_not_satisfied, with_seed
 
+
+def check_rnn_states(fused_states, stack_states, num_layers, bidirectional=False, is_lstm=True):
+    directions = 2 if bidirectional else 1
+    assert len(stack_states) / len(fused_states) == num_layers * directions
+
+    fused_states = [state.asnumpy() for state in fused_states]
+    stack_states = [np.expand_dims(state.asnumpy(), axis=0) for state in stack_states]
+    if is_lstm:
+        stack_states_h = stack_states[0::2]
+        stack_states_c = stack_states[1::2]
+        stack_states = [np.concatenate(stack_states_h, axis=0), np.concatenate(stack_states_c, axis=0)]
+    else:
+        stack_states = [np.concatenate(stack_states, axis=0)]
+
+    for f, s in zip(fused_states, stack_states):
+        assert f.shape == s.shape
+        assert_almost_equal(f, s, atol=1e-4, rtol=1e-4)
+
+
 def test_rnn():
     cell = gluon.rnn.RNNCell(100, prefix='rnn_')
     inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)]
@@ -51,6 +70,88 @@ def test_lstm():
     assert outs == [(10, 100), (10, 100), (10, 100)]
 
 
+@with_seed()
+@assert_raises_cudnn_not_satisfied(min_version='7.2.1')
+def test_lstmp():
+    hidden_size, projection_size = 512, 256
+    rtol, atol = 1e-4, 1e-4
+    batch_size, seq_len = 5, 3
+    input_size = 128
+    lstm_input = mx.nd.uniform(shape=(seq_len, batch_size, input_size))
+
+    # ==== Unidirectional Layer ====
+    for num_layers in [1, 3]:
+        fused_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size,
+                                    num_layers=num_layers, layout='TNC', bidirectional=False,
+                                    prefix='lstm0_')
+
+        stack_layer = mx.gluon.rnn.HybridSequentialRNNCell(prefix='lstm0_')
+        with stack_layer.name_scope():
+            for i in range(num_layers):
+                stack_layer.add(gluon.contrib.rnn.LSTMPCell(hidden_size,
+                                                            projection_size=projection_size,
+                                                            prefix='l%d_' % i))
+        fused_layer.initialize()
+        stack_layer.initialize()
+
+        fused_begin_state = fused_layer.begin_state(batch_size)
+        stack_begin_state = stack_layer.begin_state(batch_size=batch_size)
+        fused_layer.infer_shape(lstm_input, fused_begin_state)
+        fused_layer_params = fused_layer.collect_params()
+        stack_layer_params = stack_layer.collect_params()
+
+        for name, value in fused_layer_params.items():
+            w = mx.nd.random.uniform(shape=value.shape)
+            value.set_data(w.copy())
+            stack_layer_params[name].set_data(w.copy())
+
+        fused_output, fused_states = fused_layer(lstm_input.copy(), fused_begin_state)
+        stack_output, stack_states = stack_layer.unroll(seq_len, lstm_input.copy(), begin_state=stack_begin_state,
+                                                        layout='TNC',
+                                                        merge_outputs=True)
+
+        assert_almost_equal(fused_output.asnumpy(), stack_output.asnumpy(), rtol=rtol, atol=atol)
+        check_rnn_states(fused_states, stack_states, num_layers, False)
+
+    # ==== Bidirectional Layer ====
+    for num_layers in [1, 3]:
+        fused_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size,
+                                    num_layers=num_layers, layout='TNC', bidirectional=True,
+                                    prefix='lstm0_')
+
+        stack_layer = mx.gluon.rnn.HybridSequentialRNNCell(prefix='lstm0_')
+        with stack_layer.name_scope():
+            for i in range(num_layers):
+                stack_layer.add(
+                    gluon.rnn.BidirectionalCell(gluon.contrib.rnn.LSTMPCell(hidden_size,
+                                                                            projection_size=projection_size,
+                                                                            prefix='l%d_' % i),
+                                                gluon.contrib.rnn.LSTMPCell(hidden_size,
+                                                                            projection_size=projection_size,
+                                                                            prefix='r%d_' % i)))
+        fused_layer.initialize()
+        stack_layer.initialize()
+
+        fused_begin_state = fused_layer.begin_state(batch_size)
+        stack_begin_state = stack_layer.begin_state(batch_size=batch_size)
+        fused_layer.infer_shape(lstm_input, fused_begin_state)
+        fused_layer_params = fused_layer.collect_params()
+        stack_layer_params = stack_layer.collect_params()
+
+        for name, value in fused_layer_params.items():
+            w = mx.nd.random.uniform(shape=value.shape)
+            value.set_data(w.copy())
+            stack_layer_params[name].set_data(w.copy())
+
+        fused_output, fused_states = fused_layer(lstm_input.copy(), fused_begin_state)
+        stack_output, stack_states = stack_layer.unroll(seq_len, lstm_input.copy(), begin_state=stack_begin_state,
+                                                        layout='TNC',
+                                                        merge_outputs=True)
+
+        assert_almost_equal(fused_output.asnumpy(), stack_output.asnumpy(), rtol=rtol, atol=atol)
+        check_rnn_states(fused_states, stack_states, num_layers, True)
+
+
 def test_lstm_forget_bias():
     forget_bias = 2.0
     stack = gluon.rnn.SequentialRNNCell()
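
The new test_lstmp case checks that the fused gluon.rnn.LSTM with projection_size produces the same outputs and states as a stack of gluon.contrib.rnn.LSTMPCell cells whose parameters were copied over by name, for both unidirectional and bidirectional layers. For reference, a single projected cell step looks like this (sizes are illustrative):

    import mxnet as mx
    from mxnet import gluon

    cell = gluon.contrib.rnn.LSTMPCell(hidden_size=512, projection_size=256)
    cell.initialize()

    x = mx.nd.random.uniform(shape=(5, 128))          # (batch, input_size)
    states = cell.begin_state(batch_size=5)           # [r_{t-1}: (5, 256), c_{t-1}: (5, 512)]
    out, new_states = cell(x, states)
    print(out.shape, [s.shape for s in new_states])   # (5, 256) [(5, 256), (5, 512)]
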
diff --git a/tools/pip/setup.py b/tools/pip/setup.py
index dd430f5..897d412 100644
--- a/tools/pip/setup.py
+++ b/tools/pip/setup.py
@@ -147,11 +147,11 @@ package_data = {'mxnet': [os.path.join('mxnet', os.path.basename(LIB_PATH[0]))],
                 'dmlc_tracker': []}
 if variant.endswith('MKL'):
     if platform.system() == 'Darwin':
-        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libmkldnn.1.dylib'), os.path.join(CURRENT_DIR, 'mxnet'))
-        package_data['mxnet'].append('mxnet/libmkldnn.1.dylib')
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libdnnl.1.dylib'), os.path.join(CURRENT_DIR, 'mxnet'))
+        package_data['mxnet'].append('mxnet/libdnnl.1.dylib')
     else:
-        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libmkldnn.so.1'), os.path.join(CURRENT_DIR, 'mxnet'))
-        package_data['mxnet'].append('mxnet/libmkldnn.so.1')
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libdnnl.so.1'), os.path.join(CURRENT_DIR, 'mxnet'))
+        package_data['mxnet'].append('mxnet/libdnnl.so.1')
     shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/mkldnn/build/install/include'),
                     os.path.join(CURRENT_DIR, 'mxnet/include/mkldnn'))
 if platform.system() == 'Linux':
diff --git a/tools/staticbuild/build_lib.sh b/tools/staticbuild/build_lib.sh
index 4a82b80..28341cf 100755
--- a/tools/staticbuild/build_lib.sh
+++ b/tools/staticbuild/build_lib.sh
@@ -35,9 +35,9 @@ $MAKE DEPS_PATH=$DEPS_PATH PSLITE
 
 if [[ $VARIANT == *mkl ]]; then
     if [[ $PLATFORM == 'linux' ]]; then
-        MKLDNN_LIBFILE='libmkldnn.so.1'
+        MKLDNN_LIBFILE='libdnnl.so.1'
     else
-        MKLDNN_LIBFILE='libmkldnn.1.dylib'
+        MKLDNN_LIBFILE='libdnnl.1.dylib'
     fi
     $MAKE DEPS_PATH=$DEPS_PATH mkldnn
     if [ ! -d lib ]; then
