This is an automated email from the ASF dual-hosted git repository.

lausen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 5dc404d  NumPy compatible serialization API (#19417)
5dc404d is described below

commit 5dc404dfb0cb5e9a034c9db5d58721e15791ad33
Author: Leonard Lausen <[email protected]>
AuthorDate: Thu Nov 19 15:44:37 2020 -0700

    NumPy compatible serialization API (#19417)
    
    Adopt NumPy and SciPy serialization formats for npx save functions.
    See https://numpy.org/neps/nep-0001-npy-format.html for details.
    
    Fixes #18667 when users use the npx.save / npx.savez functions, as the npy format
stores information about the endianness of the saved array. Loading legacy 
parameter formats is still supported. Saving legacy parameter formats is still 
supported via nd.save. Sparse arrays are saved to npz format in analogy to 
SciPy.
---
 .gitmodules                                        |   3 +
 3rdparty/libzip                                    |   1 +
 CMakeLists.txt                                     |  18 +-
 LICENSE                                            |   6 +-
 ci/build_windows.py                                |  27 +-
 ci/docker/Dockerfile.build.android                 |  26 +-
 ci/docker/Dockerfile.build.arm                     |  42 +-
 ci/docker/Dockerfile.build.jetson                  |  14 +-
 .../python/tutorials/packages/np/np-vs-numpy.md    |   9 +-
 include/mxnet/c_api.h                              |  12 +
 licenses/MIT                                       |  19 +
 python/mxnet/gluon/block.py                        |  14 +-
 python/mxnet/ndarray/utils.py                      |  11 +-
 python/mxnet/numpy_extension/utils.py              | 122 ++-
 src/c_api/c_api.cc                                 | 113 ++-
 src/serialization/cnpy.cc                          | 970 +++++++++++++++++++++
 src/serialization/cnpy.h                           |  50 ++
 tests/python/unittest/test_extensions.py           |   7 +-
 tests/python/unittest/test_ndarray.py              |  45 +-
 tests/python/unittest/test_numpy_ndarray.py        |  52 +-
 tests/python/unittest/test_sparse_ndarray.py       | 121 +--
 tests/python/unittest/test_subgraph_op.py          |  13 +-
 22 files changed, 1510 insertions(+), 185 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index dee98ea..4142caf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -25,6 +25,9 @@
 [submodule "3rdparty/nvidia_cub"]
        path = 3rdparty/nvidia_cub
        url = https://github.com/NVlabs/cub.git
+[submodule "3rdparty/libzip"]
+       path = 3rdparty/libzip
+       url = https://github.com/nih-at/libzip.git
 [submodule "3rdparty/intgemm"]
        path = 3rdparty/intgemm
        url = https://github.com/kpu/intgemm
diff --git a/3rdparty/libzip b/3rdparty/libzip
new file mode 160000
index 0000000..7db2460
--- /dev/null
+++ b/3rdparty/libzip
@@ -0,0 +1 @@
+Subproject commit 7db2460e03e228ff1dffd3febf9a760cdf2044c2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2413c56..e8d10b6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -178,7 +178,6 @@ add_definitions(-DDMLC_STRICT_CXX11)
 add_definitions(-DDMLC_USE_CXX14)
 add_definitions(-DMSHADOW_IN_CXX11)
 if(MSVC)
-  add_definitions(-DWIN32_LEAN_AND_MEAN)
   add_definitions(-D_SCL_SECURE_NO_WARNINGS)
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
   add_definitions(-DMXNET_EXPORTS)
@@ -700,6 +699,7 @@ elseif(MSVC)
       foreach(arch ${arch_code_list})
         add_library(mxnet_${arch} SHARED ${SOURCE})
         target_link_libraries(mxnet_${arch} PUBLIC mshadow)
+        target_compile_definitions(mxnet_${arch} PRIVATE -DWIN32_LEAN_AND_MEAN)
         target_compile_options(
           mxnet_${arch}
           PRIVATE
@@ -746,6 +746,7 @@ elseif(MSVC)
       string(REPLACE ";" " " NVCC_FLAGS_ARCH "${NVCC_FLAGS_ARCH}")
       string(APPEND CMAKE_CUDA_FLAGS " ${CUDA_ARCH_FLAGS_SPACES}")
       add_library(mxnet SHARED ${SOURCE})
+      target_compile_definitions(mxnet PRIVATE -DWIN32_LEAN_AND_MEAN)
       target_link_libraries(mxnet PUBLIC mshadow)
       if(MXNET_FORCE_SHARED_CRT)
         target_compile_options(
@@ -777,6 +778,7 @@ elseif(MSVC)
     endif(USE_SPLIT_ARCH_DLL)
   else()
     add_library(mxnet SHARED ${SOURCE})
+    target_compile_definitions(mxnet PRIVATE -DWIN32_LEAN_AND_MEAN)
     target_link_libraries(mxnet PUBLIC mshadow)
   endif()
 endif()
@@ -835,6 +837,20 @@ elseif(MSVC)
   endif()
 endif()
 
+# used for numpy-compatible serialization
+function(load_libzip)
+  set(BUILD_SHARED_LIBS FALSE CACHE BOOL "Build shared libzip libraries")
+  set(CMAKE_POLICY_DEFAULT_CMP0074 NEW)  # Take ZLIB_ROOT into account
+  add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/libzip" 
EXCLUDE_FROM_ALL)
+  if(MSVC AND USE_SPLIT_ARCH_DLL AND USE_CUDA)
+    list(APPEND mxnet_LINKER_LIBS libzip::zip)
+    set(mxnet_LINKER_LIBS "${mxnet_LINKER_LIBS}" PARENT_SCOPE)
+  else()
+    target_link_libraries(mxnet PRIVATE libzip::zip)
+  endif()
+endfunction()
+load_libzip()
+
 if(USE_DIST_KVSTORE)
   add_subdirectory("3rdparty/ps-lite")
   add_definitions(-DMXNET_USE_DIST_KVSTORE)
diff --git a/LICENSE b/LICENSE
index dec14b7..1310702 100644
--- a/LICENSE
+++ b/LICENSE
@@ -235,9 +235,10 @@
     
docs/python_docs/themes/mx-theme/mxtheme/static/material-design-icons-3.0.1 
(Copy of the License available at top of current file)
     docs/python_docs/themes/mx-theme/mxtheme/static/font/Roboto (Copy of the 
License available at top of current file)
     3rdparty/tvm/3rdparty/bfloat16/bfloat16.cc (Copy of the License available 
at top of current file)
+    3rdparty/libzip/regress/ossfuzz.sh
 
     
=======================================================================================
-    MIT license (see licenses/LICENSE.MIT
+    MIT license
     
=======================================================================================
 
     example/gluon/tree_lstm
@@ -263,6 +264,7 @@
     3rdparty/nvidia_cub/test/mersenne.h
     3rdparty/googletest/googlemock
     3rdparty/googletest/googletest
+    3rdparty/libzip
     cmake/upstream/FindCUDAToolkit.cmake
     cmake/upstream/select_compute_arch.cmake
     src/operator/numpy/np_einsum_op-inl.h
@@ -316,6 +318,8 @@
     Apache-2.0 license + MIT License
     
=======================================================================================
 
+    src/serialization/cnpy.h (Copy of the AL2 License available at the top of 
this file, MIT License available at licenses/MIT)
+    src/serialization/cnpy.cc (Copy of the AL2 License available at the top of 
this file, MIT License available at licenses/MIT)
     3rdparty/onnx-tensorrt/third_party/onnx/tools/protoc-gen-mypy.py (Copy of 
the referenced AL2 License available at top of current file)
 
     
=======================================================================================
diff --git a/ci/build_windows.py b/ci/build_windows.py
index f184922..ceab45c 100755
--- a/ci/build_windows.py
+++ b/ci/build_windows.py
@@ -151,8 +151,29 @@ def windows_build(args):
     mxnet_root = get_mxnet_root()
     logging.info("Found MXNet root: {}".format(mxnet_root))
 
+    # Install zlib dependency missing from AMI
+    with remember_cwd():
+        tmpdirname = tempfile.mkdtemp()
+        zlib_path = tempfile.mkdtemp()
+        os.chdir(tmpdirname)
+        r = requests.get('https://github.com/madler/zlib/archive/v1.2.11.zip', 
allow_redirects=True)
+        with open('v1.2.11.zip', 'wb') as f:
+            f.write(r.content)
+        with zipfile.ZipFile('v1.2.11.zip', 'r') as zip_ref:
+            zip_ref.extractall('.')
+        os.chdir('zlib-1.2.11')
+        os.mkdir('build')
+        os.chdir('build')
+        cmd = '"{}" && cmake -GNinja -DCMAKE_INSTALL_PREFIX={} 
-DBUILD_SHARED_LIBS=0 ' \
+            "-DCMAKE_C_COMPILER=cl -DCMAKE_BUILD_TYPE=Release .. && " \
+            "ninja install".format(args.vcvars, zlib_path)
+        logging.info("Compiling zlib with CMake:\n{}".format(cmd))
+        check_call(cmd, shell=True)
+    shutil.rmtree(tmpdirname)
+    os.remove(os.path.join(zlib_path, 'lib', 'zlib.lib'))
+
     # cuda thrust / CUB + VS 2019 is flaky: try multiple times if fail
-    MAXIMUM_TRY = 5
+    MAXIMUM_TRY = 1
     build_try = 0
 
     while build_try < MAXIMUM_TRY:
@@ -163,6 +184,7 @@ def windows_build(args):
         with remember_cwd():
             os.chdir(path)
             env = os.environ.copy()
+            env["ZLIB_ROOT"] = zlib_path
             if 'GPU' in args.flavour:
                 env["CXXFLAGS"] = '/FS /MD /O2 /Ob2'
             cmd = "\"{}\" && cmake -GNinja {} {}".format(args.vcvars,
@@ -186,6 +208,9 @@ def windows_build(args):
                 logging.info("Build took 
{}".format(datetime.timedelta(seconds=int(time.time() - t0))))
                 break
 
+    # Cleanup temporary directories
+    shutil.rmtree(zlib_path)
+
     if ret == 0:
         windows_package(args)
     else:
diff --git a/ci/docker/Dockerfile.build.android 
b/ci/docker/Dockerfile.build.android
index 939e6b1..be7bab6 100644
--- a/ci/docker/Dockerfile.build.android
+++ b/ci/docker/Dockerfile.build.android
@@ -35,10 +35,10 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive 
apt-get install -y \
     unzip \
  && rm -rf /var/lib/apt/lists/*
 
-RUN curl -o android-ndk-r19-linux-x86_64.zip -L 
https://dl.google.com/android/repository/android-ndk-r19-linux-x86_64.zip && \
-    unzip android-ndk-r19-linux-x86_64.zip && \
-    rm android-ndk-r19-linux-x86_64.zip
-ENV 
CMAKE_TOOLCHAIN_FILE=/usr/local/android-ndk-r19/build/cmake/android.toolchain.cmake
+RUN curl -o android-ndk-r19c-linux-x86_64.zip -L 
https://dl.google.com/android/repository/android-ndk-r19c-linux-x86_64.zip && \
+    unzip android-ndk-r19c-linux-x86_64.zip && \
+    rm android-ndk-r19c-linux-x86_64.zip
+ENV 
CMAKE_TOOLCHAIN_FILE=/usr/local/android-ndk-r19c/build/cmake/android.toolchain.cmake
 
 ARG USER_ID=0
 ARG GROUP_ID=0
@@ -57,17 +57,15 @@ ENV ARCH=armv7l \
     HOSTCXX=g++ \
     TARGET=ARMV7
 
-RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \
-    mkdir /usr/local/openblas-android && \
+RUN git clone --recursive -b v0.3.12 https://github.com/xianyi/OpenBLAS.git && 
\
     cd /usr/local/OpenBLAS && \
-    export 
TOOLCHAIN=/usr/local/android-ndk-r19/toolchains/llvm/prebuilt/linux-x86_64 && \
+    export 
TOOLCHAIN=/usr/local/android-ndk-r19c/toolchains/llvm/prebuilt/linux-x86_64 && \
     make NOFORTRAN=1 ARM_SOFTFP_ABI=1 NO_SHARED=1 \
-        
LDFLAGS="-L/usr/local/android-ndk-r19/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/lib/gcc/arm-linux-androideabi/4.9.x
 -lm" \
+        
LDFLAGS="-L/usr/local/android-ndk-r19c/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/lib/gcc/arm-linux-androideabi/4.9.x
 -lm" \
         CC=$TOOLCHAIN/bin/armv7a-linux-androideabi16-clang 
AR=$TOOLCHAIN/bin/arm-linux-androideabi-ar && \
-    make PREFIX=/usr/local/openblas-android NO_SHARED=1 install && \
+    make 
PREFIX=/usr/local/android-ndk-r19c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/
 NO_SHARED=1 install && \
     cd /usr/local && \
     rm -rf OpenBLAS
-ENV OpenBLAS_HOME=/usr/local/openblas-android
 
 WORKDIR /work/build
 
@@ -81,16 +79,14 @@ ENV ARCH=aarch64 \
     HOSTCXX=g++ \
     TARGET=ARMV8
 
-RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \
-    mkdir /usr/local/openblas-android && \
+RUN git clone --recursive -b v0.3.12 https://github.com/xianyi/OpenBLAS.git && 
\
     cd /usr/local/OpenBLAS && \
-    export 
TOOLCHAIN=/usr/local/android-ndk-r19/toolchains/llvm/prebuilt/linux-x86_64 && \
+    export 
TOOLCHAIN=/usr/local/android-ndk-r19c/toolchains/llvm/prebuilt/linux-x86_64 && \
     make NOFORTRAN=1 NO_SHARED=1 \
         
LDFLAGS="-L/usr/local/android-ndk-r21/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/lib/gcc/aarch64-linux-android/4.9.x
 -lm" \
         CC=$TOOLCHAIN/bin/aarch64-linux-android21-clang 
AR=$TOOLCHAIN/bin/aarch64-linux-android-ar && \
-    make PREFIX=/usr/local/openblas-android NO_SHARED=1 install && \
+    make 
PREFIX=/usr/local/android-ndk-r19c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/
 NO_SHARED=1 install && \
     cd /usr/local && \
     rm -rf OpenBLAS
-ENV OpenBLAS_HOME=/usr/local/openblas-android
 
 WORKDIR /work/build
diff --git a/ci/docker/Dockerfile.build.arm b/ci/docker/Dockerfile.build.arm
index 59022c8..6b49f4f 100644
--- a/ci/docker/Dockerfile.build.arm
+++ b/ci/docker/Dockerfile.build.arm
@@ -64,13 +64,25 @@ RUN curl -o armv6-eabihf--glibc--stable-2020.02-2.tar.bz2 
-L https://toolchains.
     rm armv6-eabihf--glibc--stable-2020.02-2.tar.bz2
 ENV 
CMAKE_TOOLCHAIN_FILE=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/share/buildroot/toolchainfile.cmake
 
-RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \
+RUN git clone --recursive -b v0.3.12 https://github.com/xianyi/OpenBLAS.git && 
\
     cd /usr/local/OpenBLAS && \
     make NOFORTRAN=1 NO_SHARED=1 
CC=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/bin/arm-linux-gcc && \
     make 
PREFIX=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/arm-buildroot-linux-gnueabihf/sysroot
 NO_SHARED=1 install && \
     cd /usr/local && \
     rm -rf OpenBLAS
 
+RUN git clone --recursive -b v1.2.11 https://github.com/madler/zlib.git && \
+    cd /usr/local/zlib && \
+    CHOST=arm \
+    CC=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/bin/arm-linux-gcc \
+    AR=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/bin/arm-linux-ar \
+    
RANLIB=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/bin/arm-linux-ranlib \
+    ./configure --static 
--prefix=/usr/local/armv6-eabihf--glibc--stable-2020.02-2/arm-buildroot-linux-gnueabihf/sysroot
 && \
+    make -j$(nproc) && \
+    make install && \
+    cd /usr/local && \
+    rm -rf zlib
+
 WORKDIR /work/mxnet
 
 
@@ -91,13 +103,25 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive 
apt-get install -y \
 COPY toolchains/arm-linux-gnueabihf-toolchain.cmake /usr/local
 ENV CMAKE_TOOLCHAIN_FILE=/usr/local/arm-linux-gnueabihf-toolchain.cmake
 
-RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \
+RUN git clone --recursive -b v0.3.12 https://github.com/xianyi/OpenBLAS.git && 
\
     cd /usr/local/OpenBLAS && \
     make NOFORTRAN=1 NO_SHARED=1 CC=arm-linux-gnueabihf-gcc && \
     make PREFIX=/usr/local/arm-linux-gnueabihf NO_SHARED=1 install && \
     cd /usr/local && \
     rm -rf OpenBLAS
 
+RUN git clone --recursive -b v1.2.11 https://github.com/madler/zlib.git && \
+    cd /usr/local/zlib && \
+    CHOST=arm \
+    CC=arm-linux-gnueabihf-gcc \
+    AR=arm-linux-gnueabihf-ar \
+    RANLIB=arm-linux-gnueabihf-ranlib \
+    ./configure --static --prefix=/usr/local/arm-linux-gnueabihf && \
+    make -j$(nproc) && \
+    make install && \
+    cd /usr/local && \
+    rm -rf zlib
+
 WORKDIR /work/mxnet
 
 
@@ -118,11 +142,23 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive 
apt-get install -y \
 COPY toolchains/aarch64-linux-gnu-toolchain.cmake /usr
 ENV CMAKE_TOOLCHAIN_FILE=/usr/aarch64-linux-gnu-toolchain.cmake
 
-RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \
+RUN git clone --recursive -b v0.3.12 https://github.com/xianyi/OpenBLAS.git && 
\
     cd /usr/local/OpenBLAS && \
     make NOFORTRAN=1 NO_SHARED=1 CC=aarch64-linux-gnu-gcc && \
     make PREFIX=/usr/aarch64-linux-gnu NO_SHARED=1 install && \
     cd /usr/local && \
     rm -rf OpenBLAS
 
+RUN git clone --recursive -b v1.2.11 https://github.com/madler/zlib.git && \
+    cd /usr/local/zlib && \
+    CHOST=arm \
+    CC=aarch64-linux-gnu-gcc \
+    AR=aarch64-linux-gnu-ar \
+    RANLIB=aarch64-linux-gnu-ranlib \
+    ./configure --static --prefix=/usr/aarch64-linux-gnu && \
+    make -j$(nproc) && \
+    make install && \
+    cd /usr/local && \
+    rm -rf zlib
+
 WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.jetson 
b/ci/docker/Dockerfile.build.jetson
index 93c5558..908ec69 100644
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -51,13 +51,25 @@ RUN /work/deb_ubuntu_ccache.sh
 COPY toolchains/aarch64-linux-gnu-toolchain.cmake /usr
 ENV CMAKE_TOOLCHAIN_FILE=/usr/aarch64-linux-gnu-toolchain.cmake
 
-RUN git clone --recursive -b v0.3.9 https://github.com/xianyi/OpenBLAS.git && \
+RUN git clone --recursive -b v0.3.12 https://github.com/xianyi/OpenBLAS.git && 
\
     cd /usr/local/OpenBLAS && \
     make NOFORTRAN=1 CC=aarch64-linux-gnu-gcc && \
     make PREFIX=/usr/aarch64-linux-gnu install && \
     cd /usr/local && \
     rm -rf OpenBLAS
 
+RUN git clone --recursive -b v1.2.11 https://github.com/madler/zlib.git && \
+    cd /usr/local/zlib && \
+    CHOST=arm \
+    CC=aarch64-linux-gnu-gcc \
+    AR=aarch64-linux-gnu-ar \
+    RANLIB=aarch64-linux-gnu-ranlib \
+    ./configure --static --prefix=/usr/aarch64-linux-gnu && \
+    make -j$(nproc) && \
+    make install && \
+    cd /usr/local && \
+    rm -rf zlib
+
 # Install aarch64 cross depedencies based on Jetpack 4.4
 # Dependencies require cuda-toolkit-10.2 which isn't installed in nvidia 
docker container
 # It contains cuda-compat instead. However deb files currently depend on 
cuda-toolkit alone.
diff --git a/docs/python_docs/python/tutorials/packages/np/np-vs-numpy.md 
b/docs/python_docs/python/tutorials/packages/np/np-vs-numpy.md
index 60c87af..49be803 100644
--- a/docs/python_docs/python/tutorials/packages/np/np-vs-numpy.md
+++ b/docs/python_docs/python/tutorials/packages/np/np-vs-numpy.md
@@ -91,14 +91,19 @@ b = a[0]
 b.ndim, b.size, isinstance(b, np.ndarray)
 ```
 
-## Save
+## Save and load
 
-The `save` method in `mxnet.np` saves data into a binary format that's not 
compatible with NumPy format. For example, it contains the device information. 
(TODO, needs more discussion here.) 
+Users can use the `npx.save`, `npx.savez` and `npx.load` methods respectively 
to
+save and load arrays. `npx.save` saves single, dense arrays to the `.npy`
+format, whereas `npx.savez` can save a collection of both dense and sparse
+arrays to the `.npz` format.
 
 ```{.python .input}
 a = np.array(1, ctx=gpu)
 npx.save('a', a)
 npx.load('a')
+npx.savez('a', a=a, b=a*2)
+npx.load('a')
 ```
 
 ## Matplotlib
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 145eef1..6e062af 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -712,6 +712,18 @@ MXNET_DLL int MXNDArraySaveRawBytes(NDArrayHandle handle,
  * \param keys the name of the NDArray, optional, can be NULL
  * \return 0 when success, -1 when failure happens
  */
+MXNET_DLL int MXNDArrayLegacySave(const char* fname,
+                                  uint32_t num_args,
+                                  NDArrayHandle* args,
+                                  const char** keys);
+/*!
+ * \brief Save list of narray into the file.
+ * \param fname name of the file.
+ * \param num_args number of arguments to save.
+ * \param args the array of NDArrayHandles to be saved.
+ * \param keys the name of the NDArray, optional, can be NULL
+ * \return 0 when success, -1 when failure happens
+ */
 MXNET_DLL int MXNDArraySave(const char* fname,
                             uint32_t num_args,
                             NDArrayHandle* args,
diff --git a/licenses/MIT b/licenses/MIT
index e69de29..8e49eed 100644
--- a/licenses/MIT
+++ b/licenses/MIT
@@ -0,0 +1,19 @@
+The MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py
index abe3ce9..8ca4b5a 100644
--- a/python/mxnet/gluon/block.py
+++ b/python/mxnet/gluon/block.py
@@ -370,8 +370,10 @@ class Block:
             params = {v: k for k, v in reverse_params.items()}
 
         arg_dict = {key: val._reduce() for key, val in params.items()}
-        save_fn = _mx_npx.save if is_np_array() else ndarray.save
-        save_fn(filename, arg_dict)
+        if is_np_array():
+            _mx_npx.savez(filename, **arg_dict)
+        else:
+            ndarray.save(filename, arg_dict)
 
     def load_parameters(self, filename, ctx=None, allow_missing=False,
                         ignore_extra=False, cast_dtype=False, 
dtype_source='current'):
@@ -1438,12 +1440,14 @@ class HybridBlock(Block):
                                       .format(name=name), stacklevel=3)
                     else:
                         arg_dict['aux:%s'%name] = param._reduce()
-        save_fn = _mx_npx.save if is_np_array() else ndarray.save
         params_filename = '%s-%04d.params'%((path if path is not None else 
""), epoch)
 
         if path is not None:
-            save_fn(params_filename, arg_dict)
-            return (sym_filename, params_filename)
+            if is_np_array():
+                _mx_npx.savez(params_filename, **arg_dict)
+            else:
+                ndarray.save(params_filename, arg_dict)
+            return (sym_filename, params_filename if arg_dict else None)
 
         if remove_amp_cast:
             handle = SymbolHandle()
diff --git a/python/mxnet/ndarray/utils.py b/python/mxnet/ndarray/utils.py
index 730f217..48f84b5 100644
--- a/python/mxnet/ndarray/utils.py
+++ b/python/mxnet/ndarray/utils.py
@@ -222,12 +222,6 @@ def load_frombuffer(buf):
 def save(fname, data):
     """Saves a list of arrays or a dict of str->array to file.
 
-    Examples of filenames:
-
-    - ``/path/to/file``
-    - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 supports)
-    - ``hdfs://path/to/file`` (if compiled with HDFS supports)
-
     Parameters
     ----------
     fname : str
@@ -274,7 +268,4 @@ def save(fname, data):
     else:
         raise ValueError("data needs to either be a NDArray, dict of str, 
NDArray pairs "
                          "or a list of NDarrays.")
-    check_call(_LIB.MXNDArraySave(c_str(fname),
-                                  mx_uint(len(handles)),
-                                  handles,
-                                  keys))
+    check_call(_LIB.MXNDArrayLegacySave(c_str(fname), mx_uint(len(handles)), 
handles, keys))
diff --git a/python/mxnet/numpy_extension/utils.py 
b/python/mxnet/numpy_extension/utils.py
index 6d3f25b..886f557 100644
--- a/python/mxnet/numpy_extension/utils.py
+++ b/python/mxnet/numpy_extension/utils.py
@@ -26,61 +26,103 @@ from ..base import c_handle_array, c_str, mx_uint, 
NDArrayHandle, py_str
 from ..dlpack import ndarray_to_dlpack_for_read, ndarray_to_dlpack_for_write
 from ..dlpack import ndarray_from_dlpack, ndarray_from_numpy
 from ..numpy import ndarray, array
+from ..ndarray import NDArray
 
-__all__ = ['save', 'load', 'to_dlpack_for_read', 'to_dlpack_for_write',
+__all__ = ['save', 'savez', 'load', 'to_dlpack_for_read', 
'to_dlpack_for_write',
            'from_dlpack', 'from_numpy']
 
 def save(file, arr):
-    """Saves a list of `ndarray`s or a dict of `str`->`ndarray` to file.
+    """Save an array to a binary file in NumPy ``.npy`` format.
 
-    Examples of filenames:
+    Parameters
+    ----------
+    file : str
+        File or filename to which the data is saved.  If file is a file-object,
+        then the filename is unchanged.
+    arr : ndarray
+        Array data to be saved. Sparse formats are not supported. Please use
+        savez function to save sparse arrays.
+
+    See Also
+    --------
+    savez : Save several arrays into a ``.npz`` archive
+
+    Notes
+    -----
+    For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
+    """
+    if not isinstance(arr, NDArray):
+        raise ValueError("data needs to either be a MXNet ndarray")
+    arr = [arr]
+    keys = None
+    handles = c_handle_array(arr)
+    check_call(_LIB.MXNDArraySave(c_str(file), mx_uint(len(handles)), handles, 
keys))
+
+
+def savez(file, *args, **kwds):
+    """Save several arrays into a single file in uncompressed ``.npz`` format.
 
-    - ``/path/to/file``
-    - ``s3://my-bucket/path/to/file`` (if compiled with AWS S3 supports)
-    - ``hdfs://path/to/file`` (if compiled with HDFS supports)
+    If arguments are passed in with no keywords, the corresponding variable
+    names, in the ``.npz`` file, are 'arr_0', 'arr_1', etc. If keyword
+    arguments are given, the corresponding variable names, in the ``.npz``
+    file will match the keyword names.
 
     Parameters
     ----------
     file : str
-        Filename to which the data is saved.
-    arr : `ndarray` or list of `ndarray`s or dict of `str` to `ndarray`
-        The data to be saved.
+        Either the filename (string) or an open file (file-like object)
+        where the data will be saved.
+    args : Arguments, optional
+        Arrays to save to the file. Since it is not possible for Python to
+        know the names of the arrays outside `savez`, the arrays will be saved
+        with names "arr_0", "arr_1", and so on. These arguments can be any
+        expression.
+    kwds : Keyword arguments, optional
+        Arrays to save to the file. Arrays will be saved in the file with the
+        keyword names.
+
+    Returns
+    -------
+    None
+
+    See Also
+    --------
+    save : Save a single array to a binary file in NumPy format.
 
     Notes
     -----
-    This function can only be called within numpy semantics, i.e., 
`npx.is_np_shape()`
-    and `npx.is_np_array()` must both return true.
+    The ``.npz`` file format is a zipped archive of files named after the
+    variables they contain.  The archive is not compressed and each file
+    in the archive contains one variable in ``.npy`` format. For a
+    description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
+
+    When opening the saved ``.npz`` file with `load` a dictionary object
+    mapping file-names to the arrays themselves.
+
+    When saving dictionaries, the dictionary keys become filenames
+    inside the ZIP archive. Therefore, keys should be valid filenames.
+    E.g., avoid keys that begin with ``/`` or contain ``.``.
     """
-    if not (is_np_shape() and is_np_array()):
-        raise ValueError('Cannot save `mxnet.numpy.ndarray` in legacy mode. 
Please activate'
-                         ' numpy semantics by calling `npx.set_np()` in the 
global scope'
-                         ' before calling this function.')
-    if isinstance(arr, ndarray):
-        arr = [arr]
-    if isinstance(arr, dict):
-        str_keys = arr.keys()
-        nd_vals = arr.values()
-        if any(not isinstance(k, string_types) for k in str_keys) or \
-                any(not isinstance(v, ndarray) for v in nd_vals):
-            raise TypeError('Only accepts dict str->ndarray or list of 
ndarrays')
-        keys = c_str_array(str_keys)
-        handles = c_handle_array(nd_vals)
-    elif isinstance(arr, list):
-        if any(not isinstance(v, ndarray) for v in arr):
-            raise TypeError('Only accepts dict str->ndarray or list of 
ndarrays')
-        keys = None
-        handles = c_handle_array(arr)
-    else:
-        raise ValueError("data needs to either be a ndarray, dict of (str, 
ndarray) pairs "
-                         "or a list of ndarrays.")
-    check_call(_LIB.MXNDArraySave(c_str(file),
-                                  mx_uint(len(handles)),
-                                  handles,
-                                  keys))
+
+    if len(args):
+        for i, arg in enumerate(args):
+            name = 'arr_{}'.format(str(i))
+            assert name not in kwds, 'Naming conflict between arg {} and 
kwargs.'.format(str(i))
+            kwds[name] = arg
+
+    str_keys = kwds.keys()
+    nd_vals = kwds.values()
+    if any(not isinstance(k, string_types) for k in str_keys) or \
+            any(not isinstance(v, NDArray) for v in nd_vals):
+        raise TypeError('Only accepts dict str->ndarray or list of ndarrays')
+
+    keys = c_str_array(str_keys)
+    handles = c_handle_array(nd_vals)
+    check_call(_LIB.MXNDArraySave(c_str(file), mx_uint(len(handles)), handles, 
keys))
 
 
 def load(file):
-    """Loads an array from file.
+    """Load arrays from ``.npy``, ``.npz`` or legacy MXNet file format.
 
     See more details in ``save``.
 
@@ -115,7 +157,9 @@ def load(file):
                                   ctypes.byref(out_name_size),
                                   ctypes.byref(names)))
     if out_name_size.value == 0:
-        return [ndarray(NDArrayHandle(handles[i])) for i in 
range(out_size.value)]
+        if out_size.value != 1:
+            return [ndarray(NDArrayHandle(handles[i])) for i in 
range(out_size.value)]
+        return ndarray(NDArrayHandle(handles[0]))
     else:
         assert out_name_size.value == out_size.value
         return dict(
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 959f2e0..046ff76 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -58,7 +58,9 @@
 #include "../operator/subgraph/subgraph_property.h"
 #include "../common/utils.h"
 #include "../profiler/profiler.h"
+#include "../serialization/cnpy.h"
 #include "nnvm/pass_functions.h"
+#include "zip.h"
 
 using namespace mxnet;
 
@@ -1876,10 +1878,10 @@ int MXNDArrayWaitAll() {
   API_END();
 }
 
-int MXNDArraySave(const char* fname,
-                  uint32_t num_args,
-                  NDArrayHandle* args,
-                  const char** keys) {
+int MXNDArrayLegacySave(const char* fname,
+                        uint32_t num_args,
+                        NDArrayHandle* args,
+                        const char** keys) {
   API_BEGIN();
   std::vector<NDArray> data(num_args);
   std::vector<std::string> names;
@@ -1899,6 +1901,36 @@ int MXNDArraySave(const char* fname,
   API_END();
 }
 
+int MXNDArraySave(const char* fname,
+                  uint32_t num_args,
+                  NDArrayHandle* args,
+                  const char** keys) {
+  API_BEGIN();
+
+  CHECK_NOTNULL(fname);
+
+  if (num_args == 1 && keys == nullptr) {
+      NDArray *array = static_cast<NDArray *>(args[0]);
+      if (array->storage_type() == kDefaultStorage) {
+          npy::save_array(fname, *array);
+      } else {
+          int write_mode = ZIP_TRUNCATE | ZIP_CREATE;
+          npz::save_array(write_mode, fname, "", *array);
+      }
+  } else {
+      int write_mode = ZIP_TRUNCATE | ZIP_CREATE;
+      for (uint32_t i = 0; i < num_args; ++i) {
+          NDArray *array = static_cast<NDArray *>(args[i]);
+          const std::string array_key = keys == nullptr ? "arr_" + 
std::to_string(i) : keys[i];
+          npz::save_array(write_mode, fname, array_key, *array);
+
+          // Append to the created zip file going forward
+          write_mode = 0;
+      }
+  }
+  API_END();
+}
+
 int MXNDArrayLoad(const char* fname,
                   uint32_t *out_size,
                   NDArrayHandle** out_arr,
@@ -1907,26 +1939,63 @@ int MXNDArrayLoad(const char* fname,
   MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
   ret->ret_vec_str.clear();
   API_BEGIN();
-  std::vector<NDArray> data;
-  std::vector<std::string> &names = ret->ret_vec_str;
+
+  uint32_t magic;
   {
-    std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
-    mxnet::NDArray::Load(fi.get(), &data, &names);
-  }
-  ret->ret_handles.resize(data.size());
-  for (size_t i = 0; i < data.size(); ++i) {
-    NDArray *ptr = new NDArray();
-    *ptr = data[i];
-    ret->ret_handles[i] = ptr;
-  }
-  ret->ret_vec_charp.resize(names.size());
-  for (size_t i = 0; i < names.size(); ++i) {
-    ret->ret_vec_charp[i] = names[i].c_str();
+      std::unique_ptr<dmlc::Stream> strm(dmlc::Stream::Create(fname, "r"));
+      CHECK_EQ(strm->Read(&magic, sizeof(uint32_t)), sizeof(uint32_t))
+        << "Failed to read 32 bits from file.";
+  }
+
+  if (magic == 0x04034b50 || magic == 0x504b0304) {  // zip file format; 
assumed to be npz
+      auto[data, names] = npz::load_arrays(fname);
+      ret->ret_handles.resize(data.size());
+      for (size_t i = 0; i < data.size(); ++i) {
+          NDArray *ptr = new NDArray();
+          *ptr = data[i];
+          ret->ret_handles[i] = ptr;
+      }
+      ret->ret_vec_str.resize(names.size());
+      for (size_t i = 0; i < names.size(); ++i) {
+          ret->ret_vec_str[i] = names[i];
+      }
+      ret->ret_vec_charp.resize(names.size());
+      for (size_t i = 0; i < names.size(); ++i) {
+          ret->ret_vec_charp[i] = ret->ret_vec_str[i].c_str();
+      }
+      *out_size = static_cast<uint32_t>(data.size());
+      *out_arr = dmlc::BeginPtr(ret->ret_handles);
+      *out_name_size = static_cast<uint32_t>(names.size());
+      *out_names = dmlc::BeginPtr(ret->ret_vec_charp);
+  } else if (magic == 0x4d554e93 || magic == 0x934e554d) {  // first bytes of 
npy format
+      *out_size = 1;
+      ret->ret_handles.resize(1);
+      NDArray *ptr = new NDArray();
+      *ptr = npy::load_array(fname);  // Only supports local filesystem at 
this point in time
+      ret->ret_handles[0] = ptr;
+      *out_arr = dmlc::BeginPtr(ret->ret_handles);
+  } else {
+      std::vector<NDArray> data;
+      std::vector<std::string> &names = ret->ret_vec_str;
+      {
+          std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname, "r"));
+          mxnet::NDArray::Load(fi.get(), &data, &names);
+      }
+      ret->ret_handles.resize(data.size());
+      for (size_t i = 0; i < data.size(); ++i) {
+          NDArray *ptr = new NDArray();
+          *ptr = data[i];
+          ret->ret_handles[i] = ptr;
+      }
+      ret->ret_vec_charp.resize(names.size());
+      for (size_t i = 0; i < names.size(); ++i) {
+          ret->ret_vec_charp[i] = names[i].c_str();
+      }
+      *out_size = static_cast<uint32_t>(data.size());
+      *out_arr = dmlc::BeginPtr(ret->ret_handles);
+      *out_name_size = static_cast<uint32_t>(names.size());
+      *out_names = dmlc::BeginPtr(ret->ret_vec_charp);
   }
-  *out_size = static_cast<uint32_t>(data.size());
-  *out_arr = dmlc::BeginPtr(ret->ret_handles);
-  *out_name_size = static_cast<uint32_t>(names.size());
-  *out_names = dmlc::BeginPtr(ret->ret_vec_charp);
   API_END();
 }
 
diff --git a/src/serialization/cnpy.cc b/src/serialization/cnpy.cc
new file mode 100644
index 0000000..0ca5a06
--- /dev/null
+++ b/src/serialization/cnpy.cc
@@ -0,0 +1,970 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// File is based on https://github.com/leezu/cnpy/tree/libzip released under 
MIT License
+// Copyright (C) 2011  Carl Rogers, 2018 Leonard Lausen
+
+#include "cnpy.h"
+#include <mxnet/op_attr_types.h>
+#include <mxnet/imperative.h>
+#include <string_view>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include <fstream>
+#include <complex>
+#include <numeric>
+#include <limits>
+#include <regex>
+#include <tuple>
+#include <set>
+#include <stdexcept>
+#include <typeinfo>
+
+#include "zip.h"
+
+
+
+namespace mxnet {
+
+void fortran_order_transpose_prepare(std::vector<dim_t>& shape) {  // 
NOLINT(runtime/references)
+  std::reverse(std::begin(shape), std::end(shape));
+}
+
// Transpose `array` (whose dims were reversed by fortran_order_transpose_prepare)
// into a fresh C-order NDArray via the imperative transpose operator.
// NOLINTNEXTLINE(runtime/references)
NDArray fortran_order_transpose(std::vector<dim_t>& shape, int type_flag, NDArray& array) {
  // Undo the earlier reversal so `tshape` is the final C-order result shape.
  std::reverse(std::begin(shape), std::end(shape));
  TShape tshape(shape);
  NDArray transposed(tshape, Context::CPU(), false, type_flag);
  const std::vector<NDArray*> inputs {&array};
  const std::vector<NDArray*> outputs {&transposed};
  const std::vector<OpReqType> reqs {kWriteTo};  // Transpose does not support kWriteInplace
  // Select the operator matching the active shape semantics: the _npi_
  // variant is used under NumPy-shape mode, the legacy op otherwise.
  nnvm::NodeAttrs attrs;
  if (!Imperative::Get()->is_np_shape()) {
    attrs.op = nnvm::Op::Get("transpose");
  } else {
    attrs.op = nnvm::Op::Get("_npi_transpose");
  }
  attrs.op->attr_parser(&attrs);
  // Synchronous CPU dispatch through the imperative runtime.
  Imperative::Get()->InvokeOp(Context::CPU(), attrs, inputs, outputs,
                              reqs, DispatchMode::kFCompute, OpStatePtr());
  return transposed;
}
+
+
+namespace npy {
+
+#if (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
+     __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define MXNET_BYTEORDER "<"
+#define MXNET_BYTEORDER_CHAR '<'
+#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
+    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MXNET_BYTEORDER ">"
+#define MXNET_BYTEORDER_CHAR '>'
+#elif defined(_WIN32)
+#define MXNET_BYTEORDER "<"
+#define MXNET_BYTEORDER_CHAR '<'
+#else
+#error "endian detection needs to be set up for your compiler"
+#endif
+
+std::string dtype_descr(const TBlob& blob) {
+  switch (blob.type_flag_) {
+    case mshadow::kFloat16: return "'" MXNET_BYTEORDER "f2'";
+    case mshadow::kFloat32: return "'" MXNET_BYTEORDER "f4'";
+    case mshadow::kFloat64: return "'" MXNET_BYTEORDER "f8'";
+    case mshadow::kInt8: return "'|i1'";
+    case mshadow::kInt16: return "'" MXNET_BYTEORDER "i2'";
+    case mshadow::kInt32: return "'" MXNET_BYTEORDER "i4'";
+    case mshadow::kInt64: return "'" MXNET_BYTEORDER "i8'";
+    case mshadow::kBool: return "'|b1'";
+    case mshadow::kUint8: return "'|u1'";
+    case mshadow::kUint16: return "'" MXNET_BYTEORDER "u2'";
+    case mshadow::kUint32: return "'" MXNET_BYTEORDER "u4'";
+    case mshadow::kUint64: return "'" MXNET_BYTEORDER "u8'";
+    case mshadow::kBfloat16: return "[('bfloat16', '" MXNET_BYTEORDER "u2')]";
+    default: {
+      LOG(FATAL) << "Unknown dtype type " << blob.type_flag_ << "encountered.";
+      return "";
+    }
+  }
+}
+
+
+int dtype_descr(const std::string& dtype_descr) {
+    if (dtype_descr.find("f2'") != std::string::npos) return mshadow::kFloat16;
+    else if (dtype_descr.find("f4'") != std::string::npos) return 
mshadow::kFloat32;
+    else if (dtype_descr.find("f8'") != std::string::npos) return 
mshadow::kFloat64;
+    else if (dtype_descr.find("|i1'") != std::string::npos) return 
mshadow::kInt8;
+    else if (dtype_descr.find("i2'") != std::string::npos) return 
mshadow::kInt16;
+    else if (dtype_descr.find("i4'") != std::string::npos) return 
mshadow::kInt32;
+    else if (dtype_descr.find("i8'") != std::string::npos) return 
mshadow::kInt64;
+    else if (dtype_descr.find("|b1'") != std::string::npos) return 
mshadow::kBool;
+    else if (dtype_descr.find("|u1'") != std::string::npos) return 
mshadow::kUint8;
+    else if (dtype_descr.find("u2'") != std::string::npos) return 
mshadow::kUint16;
+    else if (dtype_descr.find("u4'") != std::string::npos) return 
mshadow::kUint32;
+    else if (dtype_descr.find("u8'") != std::string::npos) return 
mshadow::kUint64;
+    else if (dtype_descr.find("bfloat16'") != std::string::npos) return 
mshadow::kBfloat16;
+    else
+      LOG(FATAL) << "Unknown dtype descriptor " << dtype_descr << 
"encountered.";
+    return -1;
+}
+
// Build the full NPY file header (magic + version + length + dict) for `blob`.
// See https://numpy.org/neps/nep-0001-npy-format.html for the layout.
std::string create_npy_header(const TBlob& blob) {
  // Python-dict literal payload, e.g. {'descr': '<f4', 'fortran_order': False, 'shape': (2, 3), }
  std::string dict;
  dict += "{'descr': ";
  dict += dtype_descr(blob);
  dict += ", 'fortran_order': False, 'shape': (";
  if (blob.ndim()) {
    dict += std::to_string(blob.shape_[0]);
    for (int i = 1; i < blob.ndim(); i++) {
      dict += ", ";
      dict += std::to_string(blob.shape_[i]);
    }
    if (blob.ndim() == 1) {
      dict += ",";  // 1-d shapes are written as a Python 1-tuple: (5,)
    }
  }
  dict += "), }";

  // pad with spaces so that preamble+dict is modulo 64 bytes. preamble is
  // 10 bytes. dict needs to end with \n
  int remainder = 64 - (10 + dict.size() + 1) % 64;
  dict.insert(dict.end(), remainder, ' ');
  dict.push_back('\n');
  assert((dict.size() + 10) % 64 == 0);

  // Preamble: \x93NUMPY, then version, then the dict length in little-endian
  // (2 bytes for format 1.0, 4 bytes for 2.0 when the dict exceeds 64 KiB).
  std::string header;
  header += static_cast<char>(0x93);
  header += "NUMPY";

  std::string::size_type size = dict.size();
  CHECK(size <= std::numeric_limits<uint32_t>::max()) << "Shape too large for NPY serialization";
  if (size <= std::numeric_limits<uint16_t>::max()) {
    header += static_cast<char>(0x01);  // major version of numpy format
    header += static_cast<char>(0x00);  // minor version of numpy format
    uint16_t size_ = dict.size();
    header += static_cast<char>(size_ & 0xFF);
    header += static_cast<char>(size_ >> 8);
  } else {
    header += static_cast<char>(0x02);  // major version of numpy format
    header += static_cast<char>(0x00);  // minor version of numpy format
    uint32_t size_ = dict.size();
    header += static_cast<char>(size_ & 0xFF);
    header += static_cast<char>((size_ >> 8) & 0xFF);
    header += static_cast<char>((size_ >> 16) & 0xFF);
    header += static_cast<char>((size_ >> 24) & 0xFF);
  }

  header += dict;

  return header;
}
+
+uint32_t parse_npy_header_len(std::ifstream& strm) {
+  strm.exceptions(std::istream::eofbit);
+  strm.exceptions(std::istream::failbit);
+  strm.exceptions(std::istream::badbit);
+
+  CHECK_EQ(strm.get(), 0x93);
+  CHECK_EQ(strm.get(), 'N');
+  CHECK_EQ(strm.get(), 'U');
+  CHECK_EQ(strm.get(), 'M');
+  CHECK_EQ(strm.get(), 'P');
+  CHECK_EQ(strm.get(), 'Y');
+
+  uint8_t major_version = strm.get();
+  CHECK(major_version == 0x01 || major_version == 0x02) << "Unsupported npy 
major version";
+  CHECK(strm.get() == 0x00) << "Unsupported npy minor version";
+
+  uint32_t header_len = 0;
+  header_len += strm.get();
+  header_len += strm.get() >> 8;
+  if (major_version == 0x02) {
+    header_len += strm.get() >> 16;
+    header_len += strm.get() >> 24;
+  }
+  return header_len;
+}
+
+std::tuple<int, int, std::vector<dim_t>> parse_npy_header_descr(const 
std::string& header) {
+  // Fortran order
+  std::string::size_type loc = header.find("fortran_order");
+  CHECK_NE(loc, std::string::npos) << "failed to find NPY header keyword: 
'fortran_order'";
+  bool fortran_order = (header.substr(loc + 16, 4) == "True" ? true : false);
+
+  // Shape
+  loc = header.find('(');
+  std::string::size_type end_loc = header.find(')');
+  CHECK_NE(loc, std::string::npos) << "failed to find NPY header keyword: '('";
+  CHECK_NE(end_loc, std::string::npos) << "failed to find NPY header keyword: 
')'";
+  std::string shape_str = header.substr(loc+1, end_loc-loc-1);
+  std::regex num_regex("[0-9][0-9]*");
+  std::smatch sm;
+  std::vector<dim_t> shape;
+  while (std::regex_search(shape_str, sm, num_regex)) {
+    shape.push_back(std::stoi(sm[0].str()));
+    shape_str = sm.suffix().str();
+  }
+
+  // endian, word size, data type
+  // byte order code | stands for not applicable.
+  loc = header.find("descr");
+  CHECK_NE(loc, std::string::npos) << "failed to find NPY header keyword: 
'descr'";
+  // May use 
https://github.com/numpy/numpy/blob/38275835/numpy/core/src/multiarray/ctors.c#L365
+  CHECK(header[loc + 9] == MXNET_BYTEORDER_CHAR || header[loc + 9] == '|')
+    << "Loading files with non-native endianness "
+    << "is not yet supported. Please open the file "
+    << "with numpy.load, use byteswap method to "
+    << "convert endianness and re-save the file.";
+
+  int type_flag = dtype_descr(header);
+  return std::tuple(type_flag, fortran_order, shape);
+}
+
+
+void save_array(const std::string& fname, const NDArray& array_) {
+  NDArray array;  // a copy on cpu
+  if (array_.ctx().dev_mask() != cpu::kDevMask) {
+    array = array_.Copy(Context::CPU());
+    array.WaitToRead();
+  } else {
+    array = array_;
+    array.WaitToRead();
+#if MXNET_USE_MKLDNN == 1
+    if (array.IsMKLDNNData()) {
+      array = array.Reorder2Default();
+    }
+#endif
+  }
+
+  CHECK_EQ(array.storage_type(), kDefaultStorage);
+
+  const TBlob& blob = array.data();
+  std::string npy_header = create_npy_header(blob);
+
+  std::ofstream output(fname, std::ios::binary);
+  output.write(npy_header.data(), npy_header.size());
+  output.write(static_cast<const char*>(blob.dptr_), blob.Size() *
+               mshadow::mshadow_sizeof(blob.type_flag_));
+}
+
+NDArray load_array(const std::string& fname) {
+  std::ifstream strm(fname, std::ios::binary);
+  strm.exceptions(std::istream::eofbit);
+  strm.exceptions(std::istream::failbit);
+  strm.exceptions(std::istream::badbit);
+
+  uint32_t header_len = parse_npy_header_len(strm);
+  std::string header(header_len, ' ');
+  strm.read(header.data(), header_len);
+  auto[type_flag, fortran_order, shape] = parse_npy_header_descr(header);
+
+  if (fortran_order) {
+    fortran_order_transpose_prepare(shape);
+  }
+
+  TShape tshape(shape);
+  NDArray array(tshape, Context::CPU(), false, type_flag);
+  const TBlob& blob = array.data();
+  strm.read(reinterpret_cast<char*>(blob.dptr_), blob.Size() *
+            mshadow::mshadow_sizeof(blob.type_flag_));
+
+  if (fortran_order) {
+    array = fortran_order_transpose(shape, type_flag, array);
+  }
+
+  return array;
+}
+
+}  // namespace npy
+
+namespace npz {
+
+
// Append `blob` as `<blob_name>.npy` to the zip archive `zip_fname`.
// `zip_open_flags` controls creation/truncation (e.g. ZIP_CREATE | ZIP_TRUNCATE
// for the first member, 0 to append). Throws std::runtime_error on zip errors.
void save_blob(int zip_open_flags, const std::string& zip_fname, const std::string& blob_name,
               const TBlob& blob) {
  int error;
  zip_t* archive = zip_open(zip_fname.c_str(), zip_open_flags, &error);
  if (archive == nullptr) {
    zip_error_t e;
    zip_error_init_with_code(&e, error);
    throw std::runtime_error(zip_error_strerror(&e));
  }

  std::string npy_header = npy::create_npy_header(blob);

  // Declare buffers for making up the .npy file: header bytes followed by the
  // raw array data. The fragments reference npy_header and blob memory, which
  // both outlive zip_close below (libzip reads them when the archive is written).
  std::array<zip_buffer_fragment_t, 2> fragments;
  fragments[0].data = reinterpret_cast<zip_uint8_t*>(npy_header.data());
  fragments[0].length = npy_header.size();
  fragments[1].data = reinterpret_cast<zip_uint8_t*>(blob.dptr_);
  fragments[1].length = blob.Size() * mshadow::mshadow_sizeof(blob.type_flag_);

  // Third argument 0: libzip must not free the fragment buffers.
  zip_error_t e;
  zip_source_t* source =
      zip_source_buffer_fragment_create(fragments.data(), fragments.size(), 0, &e);
  if (source == nullptr) {
      throw std::runtime_error(zip_error_strerror(&e));
  }
  zip_int64_t index = zip_file_add(archive, (blob_name + ".npy").data(), source, ZIP_FL_ENC_UTF_8);
  if (index < 0) {
    // On failure the source is still owned by us and must be freed explicitly.
    zip_source_free(source);
    throw std::runtime_error(zip_strerror(archive));
  }

  // Write everything
  error = zip_close(archive);
  if (error != 0) {
    // Copy the message before zip_discard invalidates the archive handle.
    std::string strerror{zip_strerror(archive)};
    zip_discard(archive);
    throw std::runtime_error(strerror);
  }
}
+
+
// Save shape of sparse ndarray in to scipy compatible shape.npy with int64 data
// NOTE(review): this duplicates the header-building logic of create_npy_header;
// the descr is hardcoded '<i8' because the dims are explicitly serialized in
// little-endian byte order below, independent of host endianness.
void save_shape_array(int zip_open_flags, const std::string& zip_fname,
                      const std::string& blob_name, const mxnet::TShape& shape) {
  int error;
  zip_t* archive = zip_open(zip_fname.c_str(), zip_open_flags, &error);
  if (archive == nullptr) {
    zip_error_t e;
    zip_error_init_with_code(&e, error);
    throw std::runtime_error(zip_error_strerror(&e));
  }

  // Special case of create_npy_header for TShape data
  std::string dict;
  dict += "{'descr': '<i8', 'fortran_order': False, 'shape': (";
  dict += std::to_string(shape.ndim());
  dict += ",), }";
  // pad with spaces so that preamble+dict is modulo 64 bytes. preamble is
  // 10 bytes. dict needs to end with \n
  int remainder = 64 - (10 + dict.size() + 1) % 64;
  dict.insert(dict.end(), remainder, ' ');
  dict.push_back('\n');
  assert((dict.size() + 10) % 64 == 0);
  // Preamble: \x93NUMPY, version, little-endian dict length (2 or 4 bytes).
  std::string npy;
  npy += static_cast<char>(0x93);
  npy += "NUMPY";
  std::string::size_type size = dict.size();
  CHECK(size <= std::numeric_limits<uint32_t>::max()) << "Shape too large for NPY serialization";
  if (size <= std::numeric_limits<uint16_t>::max()) {
      npy += static_cast<char>(0x01);  // major version of numpy format
      npy += static_cast<char>(0x00);  // minor version of numpy format
      uint16_t size_ = dict.size();
      npy += static_cast<char>(size_ & 0xFF);
      npy += static_cast<char>(size_ >> 8);
  } else {
      npy += static_cast<char>(0x02);  // major version of numpy format
      npy += static_cast<char>(0x00);  // minor version of numpy format
      uint32_t size_ = dict.size();
      npy += static_cast<char>(size_ & 0xFF);
      npy += static_cast<char>((size_ >> 8) & 0xFF);
      npy += static_cast<char>((size_ >> 16) & 0xFF);
      npy += static_cast<char>((size_ >> 24) & 0xFF);
  }
  npy += dict;

  // Add shape data: each dim as 8 little-endian bytes, matching the '<i8' descr
  for (const uint64_t value : shape) {
      npy += static_cast<char>(value & 0xFF);
      npy += static_cast<char>((value >> 8) & 0xFF);
      npy += static_cast<char>((value >> 16) & 0xFF);
      npy += static_cast<char>((value >> 24) & 0xFF);
      npy += static_cast<char>((value >> 32) & 0xFF);
      npy += static_cast<char>((value >> 40) & 0xFF);
      npy += static_cast<char>((value >> 48) & 0xFF);
      npy += static_cast<char>((value >> 56) & 0xFF);
  }

  // `npy` outlives zip_close, so libzip may reference the buffer directly
  // (third argument 0: do not free).
  zip_error_t e;
  zip_source_t* source = zip_source_buffer_create(npy.data(), npy.size(), 0, &e);
  if (source == nullptr) {
      throw std::runtime_error(zip_error_strerror(&e));
  }
  zip_int64_t index = zip_file_add(archive, (blob_name + ".npy").data(), source, ZIP_FL_ENC_UTF_8);
  if (index < 0) {
    zip_source_free(source);
    throw std::runtime_error(zip_strerror(archive));
  }

  // Write everything
  error = zip_close(archive);
  if (error != 0) {
    std::string strerror{zip_strerror(archive)};
    zip_discard(archive);
    throw std::runtime_error(strerror);
  }
}
+
+
+void save_format_array(int zip_open_flags, const std::string& zip_fname,
+                       const std::string& blob_name, const std::string_view& 
format) {
+  int error;
+  zip_t* archive = zip_open(zip_fname.c_str(), zip_open_flags, &error);
+  if (archive == nullptr) {
+    zip_error_t e;
+    zip_error_init_with_code(&e, error);
+    throw std::runtime_error(zip_error_strerror(&e));
+  }
+
+  // Special case of create_npy_header for TShape data
+  std::string dict;
+  dict += "{'descr': '|s";
+  dict += std::to_string(format.size());
+  dict += "{'descr': '<i8', 'fortran_order': False, 'shape': (), }";
+  // pad with spaces so that preamble+dict is modulo 64 bytes. preamble is
+  // 10 bytes. dict needs to end with \n
+  int remainder = 64 - (10 + dict.size() + 1) % 64;
+  dict.insert(dict.end(), remainder, ' ');
+  dict.push_back('\n');
+  assert((dict.size() + 10) % 64 == 0);
+  std::string npy;
+  npy += static_cast<char>(0x93);
+  npy += "NUMPY";
+  std::string::size_type size = dict.size();
+  CHECK(size <= std::numeric_limits<uint32_t>::max());
+  if (size <= std::numeric_limits<uint16_t>::max()) {
+      npy += static_cast<char>(0x01);  // major version of numpy format
+      npy += static_cast<char>(0x00);  // minor version of numpy format
+      uint16_t size_ = dict.size();
+      npy += static_cast<char>(size_ & 0xFF);
+      npy += static_cast<char>(size_ >> 8);
+  } else {
+      npy += static_cast<char>(0x02);  // major version of numpy format
+      npy += static_cast<char>(0x00);  // minor version of numpy format
+      uint32_t size_ = dict.size();
+      npy += static_cast<char>(size_ & 0xFF);
+      npy += static_cast<char>((size_ >> 8) & 0xFF);
+      npy += static_cast<char>((size_ >> 16) & 0xFF);
+      npy += static_cast<char>((size_ >> 24) & 0xFF);
+  }
+  npy += dict;
+
+  npy += format;
+
+  zip_error_t e;
+  zip_source_t* source = zip_source_buffer_create(npy.data(), npy.size(), 0, 
&e);
+  if (source == nullptr) {
+      throw std::runtime_error(zip_error_strerror(&e));
+  }
+  zip_int64_t index = zip_file_add(archive, (blob_name + ".npy").data(), 
source, ZIP_FL_ENC_UTF_8);
+  if (index < 0) {
+    zip_source_free(source);
+    throw std::runtime_error(zip_strerror(archive));
+  }
+
+  // Write everything
+  error = zip_close(archive);
+  if (error != 0) {
+    std::string strerror{zip_strerror(archive)};
+    zip_discard(archive);
+    throw std::runtime_error(strerror);
+  }
+}
+
+
// Save one NDArray into the zip archive `zip_fname` under `array_name`.
// Dense arrays become a single .npy member; sparse arrays become a folder of
// members mirroring scipy.sparse.save_npz (data/indptr/indices/shape/format).
void save_array(int write_mode, const std::string& zip_fname, const std::string& array_name,
                const NDArray& array_) {
  NDArray array;  // a copy on cpu
  if (array_.ctx().dev_mask() != cpu::kDevMask) {
    array = array_.Copy(Context::CPU());
    array.WaitToRead();
  } else {
    array = array_;
    array.WaitToRead();
#if MXNET_USE_MKLDNN == 1
    if (array.IsMKLDNNData()) {
      array = array.Reorder2Default();
    }
#endif
  }

  switch (array.storage_type()) {
  case kDefaultStorage: {
    save_blob(write_mode, zip_fname, array_name, array.data());
    break;
  }
  case kCSRStorage: {
    // Member layout matches scipy.sparse.save_npz for csr matrices.
    save_blob(write_mode, zip_fname, array_name + "/data", array.data());
    write_mode = 0;  // Append to the created zip file going forward
    save_blob(write_mode, zip_fname, array_name + "/indptr", array.aux_data(csr::kIndPtr));
    save_blob(write_mode, zip_fname, array_name + "/indices", array.aux_data(csr::kIdx));
    save_shape_array(write_mode, zip_fname, array_name + "/shape", array.shape());
    save_format_array(write_mode, zip_fname, array_name + "/format", "csr");
    break;
  }
  case kRowSparseStorage: {
    // MXNet-specific format; not part of SciPy (no indptr member).
    save_blob(write_mode, zip_fname, array_name + "/data", array.data());
    write_mode = 0;  // Append to the created zip file going forward
    save_blob(write_mode, zip_fname, array_name + "/indices", array.aux_data(rowsparse::kIdx));
    save_shape_array(write_mode, zip_fname, array_name + "/shape", array.shape());
    save_format_array(write_mode, zip_fname, array_name + "/format", "row_sparse");
    break;
  }
  default: LOG(FATAL) << "Unknown storage type " << array.storage_type() << "encountered.";
  }
}
+
+
+uint32_t parse_npy_header_len(zip_file_t* file, const std::string_view& fname,
+                              const std::string& zip_fname) {
+  std::array<char, 12> buffer;
+  zip_int64_t bytesread = zip_fread(file, buffer.data(), 10);
+  if (bytesread != 10) {
+    LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+  }
+  CHECK_EQ(buffer[0], (char)0x93);
+  CHECK_EQ(buffer[1], 'N');
+  CHECK_EQ(buffer[2], 'U');
+  CHECK_EQ(buffer[3], 'M');
+  CHECK_EQ(buffer[4], 'P');
+  CHECK_EQ(buffer[5], 'Y');
+  uint8_t major_version = buffer[6];
+  CHECK(major_version == 0x01 || major_version == 0x02) << "Unsupported npy 
major version";
+  CHECK(buffer[7] == 0x00) << "Unsupported npy minor version";
+  uint32_t header_len = 0;
+  header_len += buffer[8];
+  header_len += buffer[9] >> 8;
+  if (major_version == 0x02) {
+    zip_int64_t bytesread = zip_fread(file, &buffer[10], 2);
+    if (bytesread != 2) {
+      LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+    }
+    header_len += buffer[10] >> 16;
+    header_len += buffer[11] >> 24;
+  }
+  return header_len;
+}
+
+
+std::pair<std::vector<NDArray>, std::vector<std::string>>
+load_arrays(const std::string& zip_fname) {
+  int error;
+  zip_t* archive = zip_open(zip_fname.c_str(), ZIP_RDONLY, &error);
+  if (archive == nullptr) {
+    zip_error_t e;
+    zip_error_init_with_code(&e, error);
+    throw std::runtime_error(zip_error_strerror(&e));
+  }
+
+  // Collect the set of file-names per folder in the zip file. If the set of
+  // file names in a folder matches the scipy.sparse.save_npz pattern, the
+  // folder will be restored as single sparse ndarray.
+  std::unordered_map<std::string_view, std::set<std::string_view>> names;
+
+  zip_int64_t num_entries = zip_get_num_entries(archive, ZIP_FL_UNCHANGED);
+  for (zip_uint64_t i = 0; i < num_entries; i++) {
+    std::string_view entry_name = zip_get_name(archive, i, ZIP_FL_ENC_STRICT);
+    if (entry_name.substr(entry_name.size() - 4).compare(".npy") != 0) 
continue;  // only .npy
+
+    auto dir_sep_search = entry_name.rfind("/");
+    if (dir_sep_search == std::string::npos) {  // top level file
+      [[maybe_unused]] auto[iter, inserted] = names[""].insert(entry_name);
+      CHECK(inserted);
+    } else {  // file inside a folder
+      std::string_view dirname = entry_name.substr(0, dir_sep_search + 1);
+      std::string_view fname = entry_name.substr(dir_sep_search + 1);
+      [[maybe_unused]] auto[iter, inserted] = names[dirname].insert(fname);
+      CHECK(inserted);
+    }
+  }
+
+  // Return values
+  std::vector<NDArray> arrays;
+  std::vector<std::string> return_names;
+
+  // Patterns used by SciPy to save respective sparse matrix formats to a file
+  const std::set<std::string_view> bsr_csr_csc_pattern
+    {"data.npy", "indices.npy", "indptr.npy", "format.npy", "shape.npy"};
+  const std::set<std::string_view> row_sparse_pattern  // MXNet specific 
format not part of SciPy
+    {"data.npy", "indices.npy", "format.npy", "shape.npy"};
+  const std::set<std::string_view> coo_pattern
+    {"data.npy", "row.npy", "col.npy", "format.npy", "shape.npy"};
+  const std::set<std::string_view> dia_pattern
+    {"data.npy", "offsets.npy", "format.npy", "shape.npy"};
+  for (const auto& [dirname, dircontents] : names) {
+    if (dircontents == bsr_csr_csc_pattern) {
+      // Check format
+      std::string fname(dirname);
+      fname += "format.npy";
+      zip_file_t* format_file = zip_fopen(archive, fname.data(), 
ZIP_FL_UNCHANGED);
+      if (format_file == nullptr) {
+        throw std::runtime_error(zip_strerror(archive));
+      }
+
+      // In the special case of format.npy we ignore the header as it
+      // specifies the string datatype which is unsupported by MXNet
+      uint32_t header_len = parse_npy_header_len(format_file, fname, 
zip_fname);
+      std::string header;
+      header.resize(header_len);
+      zip_int64_t bytesread = zip_fread(format_file, header.data(), 
header_len);
+      if (bytesread != header_len) {
+        LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+      }
+      // and simply look at the next 3 bytes containing the format string
+      std::string format;
+      format.resize(3);
+      bytesread = zip_fread(format_file, format.data(), 3);
+      zip_fclose(format_file);
+      if (bytesread != 3) {
+        LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+      }
+
+      if (format == "csr") {
+        // Prepare reading storage data array
+        fname = dirname;
+        fname += "data.npy";
+        zip_file_t* data_file = zip_fopen(archive, fname.data(), 
ZIP_FL_UNCHANGED);
+        if (data_file == nullptr) {
+          throw std::runtime_error(zip_strerror(archive));
+        }
+        uint32_t header_len = parse_npy_header_len(data_file, fname, 
zip_fname);
+        std::string header;
+        header.resize(header_len);
+        zip_int64_t bytesread = zip_fread(data_file, header.data(), 
header_len);
+        if (bytesread != header_len) {
+          LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+        }
+        auto[storage_type_flag, storage_fortran_order, storage_shape] = \
+          npy::parse_npy_header_descr(header);
+        if (storage_fortran_order) {
+          LOG(FATAL) << "Reading fortran order data for sparse arrays not yet 
implemented.";
+        }
+        TShape storage_tshape(storage_shape);
+
+        // Prepare reading indptr aux array
+        fname = dirname;
+        fname += "indptr.npy";
+        zip_file_t* indptr_file = zip_fopen(archive, fname.data(), 
ZIP_FL_UNCHANGED);
+        if (indptr_file == nullptr) {
+          throw std::runtime_error(zip_strerror(archive));
+        }
+        header_len = parse_npy_header_len(indptr_file, fname, zip_fname);
+        header.resize(header_len);
+        bytesread = zip_fread(indptr_file, header.data(), header_len);
+        if (bytesread != header_len) {
+          LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+        }
+        auto[indptr_type_flag, indptr_fortran_order, indptr_shape] = \
+          npy::parse_npy_header_descr(header);
+        if (indptr_fortran_order) {
+          LOG(FATAL) << "Reading fortran order data for sparse arrays not yet 
implemented.";
+        }
+        TShape indptr_tshape(indptr_shape);
+
+        // Prepare reading indices aux array
+        fname = dirname;
+        fname += "indices.npy";
+        zip_file_t* indices_file = zip_fopen(archive, fname.data(), 
ZIP_FL_UNCHANGED);
+        if (indices_file == nullptr) {
+          throw std::runtime_error(zip_strerror(archive));
+        }
+        header_len = parse_npy_header_len(indices_file, fname, zip_fname);
+        header.resize(header_len);
+        bytesread = zip_fread(indices_file, header.data(), header_len);
+        if (bytesread != header_len) {
+          LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+        }
+        auto[indices_type_flag, indices_fortran_order, indices_shape] = \
+          npy::parse_npy_header_descr(header);
+        if (indices_fortran_order) {
+          LOG(FATAL) << "Reading fortran order data for sparse arrays not yet 
implemented.";
+        }
+        TShape indices_tshape(indices_shape);
+
+        // Read shape data array
+        fname = dirname;
+        fname += "shape.npy";
+        zip_file_t* shape_file = zip_fopen(archive, fname.data(), 
ZIP_FL_UNCHANGED);
+        if (shape_file == nullptr) {
+          throw std::runtime_error(zip_strerror(archive));
+        }
+        header_len = parse_npy_header_len(shape_file, fname, zip_fname);
+        header.resize(header_len);
+        bytesread = zip_fread(shape_file, header.data(), header_len);
+        if (bytesread != header_len) {
+          LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+        }
+        auto[shape_type_flag, shape_fortran_order, shape_shape] = \
+          npy::parse_npy_header_descr(header);
+        if (shape_fortran_order) {
+          LOG(FATAL) << "Reading fortran order data for sparse arrays not yet 
implemented.";
+        }
+        CHECK_EQ(shape_shape.size(), 1) << "Expected one-dimensional shape of 
shape information.";
+        TShape tshape(shape_shape.at(0), -1);
+        if (shape_type_flag == mshadow::kInt64) {  // Used in most SciPy builds
+          for (dim_t i = 0; i < shape_shape.at(0); i++) {
+            int64_t dim;
+            bytesread = zip_fread(shape_file, &dim, 8);
+            if (bytesread != 8) {
+              LOG(FATAL) << "Failed to read from " << fname << " member of " 
<< zip_fname;
+            }
+            tshape[i] = dim;
+          }
+        } else if (shape_type_flag == mshadow::kInt32) {  // Used in SciPy pip 
wheels on Windows
+          for (dim_t i = 0; i < shape_shape.at(0); i++) {
+            int32_t dim;
+            bytesread = zip_fread(shape_file, &dim, 4);
+            if (bytesread != 4) {
+              LOG(FATAL) << "Failed to read from " << fname << " member of " 
<< zip_fname;
+            }
+            tshape[i] = dim;
+          }
+        } else {
+          LOG(FATAL) << "Expected shape information in int64 or int32 format.";
+        }
+        zip_fclose(shape_file);
+
+        // Construct aux datastructures
+        static_assert(csr::CSRAuxType::kIndPtr == 0);
+        static_assert(csr::CSRAuxType::kIdx == 1);
+        const std::vector<int> aux_types {indptr_type_flag, indices_type_flag};
+        const mxnet::ShapeVector aux_shapes {indptr_tshape, indices_tshape};
+
+        // Allocate NDArray
+        NDArray array(NDArrayStorageType::kCSRStorage, tshape, Context::CPU(), 
false,
+                      storage_type_flag, aux_types, aux_shapes, 
storage_tshape);
+
+        // Read data array
+        const TBlob& blob = array.data();
+        zip_uint64_t nbytes = blob.Size() * 
mshadow::mshadow_sizeof(blob.type_flag_);
+        bytesread = zip_fread(data_file, blob.dptr_, nbytes);
+        zip_fclose(data_file);
+        if (bytesread != nbytes) {
+          LOG(FATAL) << "Failed to read from data.npy member of " << zip_fname;
+        }
+
+        // Read indptr array
+        const TBlob& indptr_blob = array.aux_data(csr::CSRAuxType::kIndPtr);
+        nbytes = indptr_blob.Size() * 
mshadow::mshadow_sizeof(indptr_blob.type_flag_);
+        bytesread = zip_fread(indptr_file, indptr_blob.dptr_, nbytes);
+        zip_fclose(indptr_file);
+        if (bytesread != nbytes) {
+          LOG(FATAL) << "Failed to read from indptr.npy member of " << 
zip_fname;
+        }
+
+        // Read indices array
+        const TBlob& indices_blob = array.aux_data(csr::CSRAuxType::kIdx);
+        nbytes = indices_blob.Size() * 
mshadow::mshadow_sizeof(indices_blob.type_flag_);
+        bytesread = zip_fread(indices_file, indices_blob.dptr_, nbytes);
+        zip_fclose(indices_file);
+        if (bytesread != nbytes) {
+          LOG(FATAL) << "Failed to read from indices.npy member of " << 
zip_fname;
+        }
+
+        arrays.push_back(array);
+        return_names.emplace_back(dirname.size() ?   // Exclude "/"
+                                  dirname.substr(0, dirname.size() - 1) : 
dirname);
+
+      } else {
+        throw std::runtime_error("Loading " + format + " sparse matrix format 
is unsupported.");
+      }
+    } else if (dircontents == row_sparse_pattern) {
+      // Check format
+      std::string fname(dirname);
+      fname += "format.npy";
+      zip_file_t* format_file = zip_fopen(archive, fname.data(), 
ZIP_FL_UNCHANGED);
+      if (format_file == nullptr) {
+        throw std::runtime_error(zip_strerror(archive));
+      }
+
+      // In the special case of format.npy we ignore the header as it
+      // specifies the string datatype which is unsupported by MXNet
+      uint32_t header_len = parse_npy_header_len(format_file, fname, 
zip_fname);
+      std::string header;
+      header.resize(header_len);
+      zip_int64_t bytesread = zip_fread(format_file, header.data(), 
header_len);
+      if (bytesread != header_len) {
+        LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+      }
+      // and simply look at the next 10 bytes containing the format string
+      std::string format;
+      format.resize(10);
+      bytesread = zip_fread(format_file, format.data(), 10);
+      zip_fclose(format_file);
+
+      if (format == "row_sparse") {
+        // Prepare reading storage data array
+        fname = dirname;
+        fname += "data.npy";
+        zip_file_t* data_file = zip_fopen(archive, fname.data(), 
ZIP_FL_UNCHANGED);
+        if (data_file == nullptr) {
+          throw std::runtime_error(zip_strerror(archive));
+        }
+        uint32_t header_len = parse_npy_header_len(data_file, fname, 
zip_fname);
+        std::string header;
+        header.resize(header_len);
+        zip_int64_t bytesread = zip_fread(data_file, header.data(), 
header_len);
+        if (bytesread != header_len) {
+          LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+        }
+        auto[storage_type_flag, storage_fortran_order, storage_shape] = \
+          npy::parse_npy_header_descr(header);
+        if (storage_fortran_order) {
+          LOG(FATAL) << "Reading fortran order data for sparse arrays not yet 
implemented.";
+        }
+        TShape storage_tshape(storage_shape);
+
+        // Prepare reading indices aux array
+        fname = dirname;
+        fname += "indices.npy";
+        zip_file_t* indices_file = zip_fopen(archive, fname.data(), 
ZIP_FL_UNCHANGED);
+        if (indices_file == nullptr) {
+          throw std::runtime_error(zip_strerror(archive));
+        }
+        header_len = parse_npy_header_len(indices_file, fname, zip_fname);
+        header.resize(header_len);
+        bytesread = zip_fread(indices_file, header.data(), header_len);
+        if (bytesread != header_len) {
+          LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+        }
+        auto[indices_type_flag, indices_fortran_order, indices_shape] = \
+          npy::parse_npy_header_descr(header);
+        if (indices_fortran_order) {
+          LOG(FATAL) << "Reading fortran order data for sparse arrays not yet 
implemented.";
+        }
+        TShape indices_tshape(indices_shape);
+
+        // Read shape data array
+        fname = dirname;
+        fname += "shape.npy";
+        zip_file_t* shape_file = zip_fopen(archive, fname.data(), 
ZIP_FL_UNCHANGED);
+        if (shape_file == nullptr) {
+          throw std::runtime_error(zip_strerror(archive));
+        }
+        header_len = parse_npy_header_len(shape_file, fname, zip_fname);
+        header.resize(header_len);
+        bytesread = zip_fread(shape_file, header.data(), header_len);
+        if (bytesread != header_len) {
+          LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+        }
+        auto[shape_type_flag, shape_fortran_order, shape_shape] = \
+          npy::parse_npy_header_descr(header);
+        if (shape_fortran_order) {
+          LOG(FATAL) << "Reading fortran order data for sparse arrays not yet 
implemented.";
+        }
+        CHECK_EQ(shape_type_flag, mshadow::kInt64) << "Expected shape 
information in int64 format.";
+        CHECK_EQ(shape_shape.size(), 1) << "Expected one-dimensional shape of 
shape information.";
+        TShape tshape(shape_shape.at(0), -1);
+        for (dim_t i = 0; i < shape_shape.at(0); i++) {
+          int64_t dim;
+          bytesread = zip_fread(shape_file, &dim, 8);
+          if (bytesread != 8) {
+            LOG(FATAL) << "Failed to read from " << fname << " member of " << 
zip_fname;
+          }
+          tshape[i] = dim;
+        }
+        zip_fclose(shape_file);
+
+        // Construct aux datastructures
+        static_assert(rowsparse::RowSparseAuxType::kIdx == 0);
+        const std::vector<int> aux_types {indices_type_flag};
+        const mxnet::ShapeVector aux_shapes {indices_tshape};
+
+        // Allocate NDArray
+        NDArray array(NDArrayStorageType::kRowSparseStorage, tshape, 
Context::CPU(), false,
+                      storage_type_flag, aux_types, aux_shapes, 
storage_tshape);
+
+        // Read data array
+        const TBlob& blob = array.data();
+        zip_uint64_t nbytes = blob.Size() * 
mshadow::mshadow_sizeof(blob.type_flag_);
+        bytesread = zip_fread(data_file, blob.dptr_, nbytes);
+        zip_fclose(data_file);
+        if (bytesread != nbytes) {
+          LOG(FATAL) << "Failed to read from data.npy member of " << zip_fname;
+        }
+
+        // Read indices array
+        const TBlob& indices_blob = 
array.aux_data(rowsparse::RowSparseAuxType::kIdx);
+        nbytes = indices_blob.Size() * 
mshadow::mshadow_sizeof(indices_blob.type_flag_);
+        bytesread = zip_fread(indices_file, indices_blob.dptr_, nbytes);
+        zip_fclose(indices_file);
+        if (bytesread != nbytes) {
+          LOG(FATAL) << "Failed to read from indices.npy member of " << 
zip_fname;
+        }
+
+        arrays.push_back(array);
+        return_names.emplace_back(dirname.size() ?   // Exclude "/"
+                                  dirname.substr(0, dirname.size() - 1) : 
dirname);
+
+      } else {
+        throw std::runtime_error("Loading " + format + " sparse matrix format 
is unsupported.");
+      }
+    } else if (dircontents == coo_pattern) {
+      throw std::runtime_error("Loading COO sparse matrix format is 
unsupported.");
+    } else if (dircontents == dia_pattern) {
+      throw std::runtime_error("Loading DIA sparse matrix format is 
unsupported.");
+    } else {  // Folder does not match scipy sparse pattern; treat containing 
files as dense
+      for (const std::string_view& fname : dircontents) {
+        std::string path(dirname);
+        path += fname;
+
+        // The string_view points to a null-terminated character array
+        // owned by zip_get_name and thus conversion to C char* is valid
+        zip_file_t* file = zip_fopen(archive, path.data(), ZIP_FL_UNCHANGED);
+        if (file == nullptr) {
+          throw std::runtime_error(zip_strerror(archive));
+        }
+
+        uint32_t header_len = parse_npy_header_len(file, path, zip_fname);
+        std::string header;
+        header.resize(header_len);
+        zip_int64_t bytesread = zip_fread(file, header.data(), header_len);
+        if (bytesread != header_len) {
+          LOG(FATAL) << "Failed to read from " << path << " member of " << 
zip_fname;
+        }
+        auto[type_flag, fortran_order, shape] = 
npy::parse_npy_header_descr(header);
+
+        if (fortran_order) {
+          fortran_order_transpose_prepare(shape);
+        }
+
+        TShape tshape(shape);
+        NDArray array(tshape, Context::CPU(), false, type_flag);
+        const TBlob& blob = array.data();
+        bytesread = zip_fread(file, blob.dptr_,
+                              blob.Size() * 
mshadow::mshadow_sizeof(blob.type_flag_));
+        zip_fclose(file);
+        if (bytesread != blob.Size() * 
mshadow::mshadow_sizeof(blob.type_flag_)) {
+          LOG(FATAL) << "Failed to read from " << path << " member of " << 
zip_fname;
+        }
+
+        if (fortran_order) {
+          array = fortran_order_transpose(shape, type_flag, array);
+        }
+
+        arrays.push_back(array);
+        return_names.emplace_back(path.substr(0, path.size() - 4));  // Skip 
.npy
+      }
+    }
+  }
+
+  zip_discard(archive);
+
+  return std::make_pair(arrays, return_names);
+}
+
+}  // namespace npz
+}  // namespace mxnet
diff --git a/src/serialization/cnpy.h b/src/serialization/cnpy.h
new file mode 100644
index 0000000..32995c6
--- /dev/null
+++ b/src/serialization/cnpy.h
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+// File is based on https://github.com/leezu/cnpy/tree/libzip released under 
MIT License
+// Copyright (C) 2011  Carl Rogers, 2018 Leonard Lausen
+
+#ifndef MXNET_SERIALIZATION_CNPY_H_
+#define MXNET_SERIALIZATION_CNPY_H_
+
+#include <mxnet/ndarray.h>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace mxnet {
+
+namespace npy {
+
+void save_array(const std::string& fname, const NDArray& array);
+NDArray load_array(const std::string& fname);
+
+}
+
+namespace npz {
+
+void save_array(int write_mode, const std::string& zip_fname, const 
std::string& array_name,
+                const NDArray& array);
+
+std::pair<std::vector<NDArray>, std::vector<std::string>>  load_arrays(const 
std::string& fname);
+
+}
+}  // namespace mxnet
+#endif  // MXNET_SERIALIZATION_CNPY_H_
diff --git a/tests/python/unittest/test_extensions.py 
b/tests/python/unittest/test_extensions.py
index 087fc3b..4e71881 100644
--- a/tests/python/unittest/test_extensions.py
+++ b/tests/python/unittest/test_extensions.py
@@ -179,9 +179,10 @@ def test_subgraph():
     a_data = mx.nd.ones((3,2))
     b_data = mx.nd.ones((3,2))
     sym_block3.optimize_for(a_data, b_data, backend='myProp')
-    sym_block3.export('optimized')
-    sym_block4 = nn.SymbolBlock.imports('optimized-symbol.json',['a','b'],
-                                        'optimized-0000.params')
+    sym_filename, params_filename = sym_block3.export('optimized')
+    assert sym_filename == 'optimized-symbol.json'
+    assert params_filename is None
+    sym_block4 = nn.SymbolBlock.imports(sym_filename, ['a','b'], 
params_filename)
 
     out5 = sym_block4(a_data, b_data)
     # check that result matches one executed by MXNet
diff --git a/tests/python/unittest/test_ndarray.py 
b/tests/python/unittest/test_ndarray.py
index 40383e0..f59906b 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -370,38 +370,71 @@ def test_ndarray_pickle():
         assert np.sum(a.asnumpy() != a2.asnumpy()) == 0
 
 
-def test_ndarray_saveload():
[email protected]('save_fn', [mx.nd.save, mx.npx.savez])
+def test_ndarray_saveload(save_fn):
     nrepeat = 10
-    fname = 'tmp_list.bin'
+    fname = 'tmp_list'
     for repeat in range(nrepeat):
         data = []
         # test save/load as list
         for i in range(10):
             data.append(random_ndarray(np.random.randint(1, 5)))
-        mx.nd.save(fname, data)
+        if save_fn is mx.nd.save:
+            save_fn(fname, data)
+        else:
+            save_fn(fname, *data)
         data2 = mx.nd.load(fname)
         assert len(data) == len(data2)
-        for x, y in zip(data, data2):
+        for x, y in zip(data, data2 if save_fn is mx.nd.save else 
data2.values()):
             assert np.sum(x.asnumpy() != y.asnumpy()) == 0
         # test save/load as dict
         dmap = {'ndarray xx %s' % i : x for i, x in enumerate(data)}
-        mx.nd.save(fname, dmap)
+        if save_fn is mx.nd.save:
+            save_fn(fname, dmap)
+        else:
+            save_fn(fname, **dmap)
         dmap2 = mx.nd.load(fname)
         assert len(dmap2) == len(dmap)
         for k, x in dmap.items():
             y = dmap2[k]
             assert np.sum(x.asnumpy() != y.asnumpy()) == 0
+
         # test save/load as ndarray
         # we expect the single ndarray to be converted into a list containing 
the ndarray
         single_ndarray = data[0]
-        mx.nd.save(fname, single_ndarray)
+        save_fn(fname, single_ndarray)
+
+        # Test loading with numpy
+        if save_fn is mx.npx.savez:
+            with np.load(fname) as fname_np_loaded:
+                single_ndarray_loaded = fname_np_loaded['arr_0']
+            assert np.sum(single_ndarray.asnumpy() != single_ndarray_loaded) 
== 0
+
+            mx.npx.save(fname, single_ndarray)
+            single_ndarray_loaded = np.load(fname)
+            assert np.sum(single_ndarray.asnumpy() != single_ndarray_loaded) 
== 0
+
+        # Test loading with mxnet backend
         single_ndarray_loaded = mx.nd.load(fname)
         assert len(single_ndarray_loaded) == 1
         single_ndarray_loaded = single_ndarray_loaded[0]
         assert np.sum(single_ndarray.asnumpy() != 
single_ndarray_loaded.asnumpy()) == 0
+
     os.remove(fname)
 
 
[email protected]_np
+def test_ndarray_load_fortran_order(tmp_path):
+    arr = np.arange(20).reshape((2, 10)).T
+    assert np.isfortran(arr)
+    np.save(tmp_path / 'fortran_order.npy', arr)
+
+    mx_arr = mx.npx.load(str(tmp_path / 'fortran_order.npy'))
+    np_mx_arr = mx_arr.asnumpy()
+    assert not np.isfortran(np_mx_arr)
+    assert np.sum(np_mx_arr != arr) == 0
+
+
 def test_ndarray_legacy_load():
     data = []
     for i in range(6):
diff --git a/tests/python/unittest/test_numpy_ndarray.py 
b/tests/python/unittest/test_numpy_ndarray.py
index 4ce4f75..c9ccafd 100644
--- a/tests/python/unittest/test_numpy_ndarray.py
+++ b/tests/python/unittest/test_numpy_ndarray.py
@@ -1001,30 +1001,40 @@ def test_np_ndarray_indexing():
 
 @use_np
 @pytest.mark.serial
-def test_np_save_load_ndarrays():
[email protected]('load_fn', [_np.load, npx.load])
+def test_np_save_load_ndarrays(load_fn):
     shapes = [(2, 0, 1), (0,), (), (), (0, 4), (), (3, 0, 0, 0), (2, 1), (0, 
5, 0), (4, 5, 6), (0, 0, 0)]
     array_list = [_np.random.randint(0, 10, size=shape) for shape in shapes]
     array_list = [np.array(arr, dtype=arr.dtype) for arr in array_list]
-    # test save/load single ndarray
+    # test save/load single ndarray to npy format
     for i, arr in enumerate(array_list):
         with TemporaryDirectory() as work_dir:
             fname = os.path.join(work_dir, 'dataset.npy')
             npx.save(fname, arr)
-            arr_loaded = npx.load(fname)
-            assert isinstance(arr_loaded, list)
-            assert len(arr_loaded) == 1
-            assert _np.array_equal(arr_loaded[0].asnumpy(), 
array_list[i].asnumpy())
+            arr_loaded = load_fn(fname)
+            assert _np.array_equal(arr_loaded.asnumpy() if load_fn is npx.load
+                                   else arr_loaded, array_list[i].asnumpy())
 
     # test save/load a list of ndarrays
     with TemporaryDirectory() as work_dir:
         fname = os.path.join(work_dir, 'dataset.npy')
-        npx.save(fname, array_list)
-        array_list_loaded = mx.nd.load(fname)
-        assert isinstance(arr_loaded, list)
+        npx.savez(fname, *array_list)
+        if load_fn is _np.load:
+            with load_fn(fname) as array_dict_loaded:  # Ensure NPZFile is 
closed
+                array_list_loaded = [
+                    array_dict_loaded['arr_{}'.format(str(i))]
+                    for i in range(len(array_dict_loaded))
+                ]
+        else:
+            array_dict_loaded = load_fn(fname)
+            array_list_loaded = [
+                array_dict_loaded['arr_{}'.format(str(i))]
+                for i in range(len(array_dict_loaded))
+            ]
         assert len(array_list) == len(array_list_loaded)
         assert all(isinstance(arr, np.ndarray) for arr in arr_loaded)
         for a1, a2 in zip(array_list, array_list_loaded):
-            assert _np.array_equal(a1.asnumpy(), a2.asnumpy())
+            assert _np.array_equal(a1.asnumpy(), a2.asnumpy() if load_fn is 
npx.load else a2)
 
     # test save/load a dict of str->ndarray
     arr_dict = {}
@@ -1033,13 +1043,21 @@ def test_np_save_load_ndarrays():
         arr_dict[k] = v
     with TemporaryDirectory() as work_dir:
         fname = os.path.join(work_dir, 'dataset.npy')
-        npx.save(fname, arr_dict)
-        arr_dict_loaded = npx.load(fname)
-        assert isinstance(arr_dict_loaded, dict)
-        assert len(arr_dict_loaded) == len(arr_dict)
-        for k, v in arr_dict_loaded.items():
-            assert k in arr_dict
-            assert _np.array_equal(v.asnumpy(), arr_dict[k].asnumpy())
+        npx.savez(fname, **arr_dict)
+        if load_fn is _np.load:
+            with load_fn(fname) as arr_dict_loaded:  # Ensure NPZFile is closed
+                assert isinstance(arr_dict_loaded, _np.lib.npyio.NpzFile)
+                assert len(arr_dict_loaded) == len(arr_dict)
+                for k, v in arr_dict_loaded.items():
+                    assert k in arr_dict
+                    assert _np.array_equal(v.asnumpy() if load_fn is npx.load 
else v, arr_dict[k].asnumpy())
+        else:
+            arr_dict_loaded = load_fn(fname)
+            assert isinstance(arr_dict_loaded, dict)
+            assert len(arr_dict_loaded) == len(arr_dict)
+            for k, v in arr_dict_loaded.items():
+                assert k in arr_dict
+                assert _np.array_equal(v.asnumpy() if load_fn is npx.load else 
v, arr_dict[k].asnumpy())
 
 
 @retry(5)
diff --git a/tests/python/unittest/test_sparse_ndarray.py 
b/tests/python/unittest/test_sparse_ndarray.py
index 1f4ac20..728bdbd 100644
--- a/tests/python/unittest/test_sparse_ndarray.py
+++ b/tests/python/unittest/test_sparse_ndarray.py
@@ -25,9 +25,13 @@ from mxnet.base import mx_real_t
 from numpy.testing import assert_allclose
 import numpy.random as rnd
 import numpy as np
+import scipy.sparse as spsp
+
 from common import assertRaises, xfail_when_nonstandard_decimal_separator
 from mxnet.ndarray.sparse import RowSparseNDArray, CSRNDArray
 
+import pytest
+
 
 def sparse_nd_ones(shape, stype):
     return mx.nd.ones(shape).tostype(stype)
@@ -535,40 +539,63 @@ def test_sparse_nd_pickle():
             assert same(a.asnumpy(), b.asnumpy())
 
 
-# @kalyc: Getting rid of fixed seed as flakiness could not be reproduced
-# tracked at https://github.com/apache/incubator-mxnet/issues/11741
-def test_sparse_nd_save_load():
-    repeat = 1
[email protected]('save_fn', [mx.nd.save, mx.npx.savez])
+def test_sparse_nd_save_load(save_fn):
     stypes = ['default', 'row_sparse', 'csr']
     stype_dict = {'default': NDArray, 'row_sparse': RowSparseNDArray, 'csr': 
CSRNDArray}
     num_data = 20
     densities = [0, 0.5]
-    fname = 'tmp_list.bin'
-    for _ in range(repeat):
-        data_list1 = []
-        for i in range(num_data):
-            stype = stypes[np.random.randint(0, len(stypes))]
-            shape = rand_shape_2d(dim0=40, dim1=40)
-            density = densities[np.random.randint(0, len(densities))]
-            data_list1.append(rand_ndarray(shape, stype, density))
-            assert isinstance(data_list1[-1], stype_dict[stype])
-        mx.nd.save(fname, data_list1)
-
-        data_list2 = mx.nd.load(fname)
-        assert len(data_list1) == len(data_list2)
-        for x, y in zip(data_list1, data_list2):
-            assert same(x.asnumpy(), y.asnumpy())
-
-        data_map1 = {'ndarray xx %s' % i: x for i, x in enumerate(data_list1)}
-        mx.nd.save(fname, data_map1)
-        data_map2 = mx.nd.load(fname)
-        assert len(data_map1) == len(data_map2)
-        for k, x in data_map1.items():
-            y = data_map2[k]
-            assert same(x.asnumpy(), y.asnumpy())
+    fname = 'tmp_list.npz'
+    data_list1 = []
+    for i in range(num_data):
+        stype = stypes[np.random.randint(0, len(stypes))]
+        shape = rand_shape_2d(dim0=40, dim1=40)
+        density = densities[np.random.randint(0, len(densities))]
+        data_list1.append(rand_ndarray(shape, stype, density))
+        assert isinstance(data_list1[-1], stype_dict[stype])
+    if save_fn is mx.nd.save:
+        save_fn(fname, data_list1)
+    else:
+        save_fn(fname, *data_list1)
+
+    data_list2 = mx.nd.load(fname)
+    if save_fn is mx.npx.savez:
+        data_list2 = [data_list2['arr_' + str(i)] for i in range(num_data)]
+    assert len(data_list1) == len(data_list2)
+    for x, y in zip(data_list1, data_list2):
+        assert same(x.asnumpy(), y.asnumpy())
+
+    data_map1 = {'ndarray xx %s' % i: x for i, x in enumerate(data_list1)}
+    if save_fn is mx.nd.save:
+        save_fn(fname, data_map1)
+    else:
+        save_fn(fname, **data_map1)
+    data_map2 = mx.nd.load(fname)
+    assert len(data_map1) == len(data_map2)
+    for k, x in data_map1.items():
+        y = data_map2[k]
+        assert same(x.asnumpy(), y.asnumpy())
     os.remove(fname)
 
 
[email protected]('save_fn', [mx.nd.save, mx.npx.savez])
+def test_sparse_ndarray_load_csr_npz_scipy(tmp_path, save_fn):
+    csr_sp = spsp.rand(50, 100, density=0.5, format="csr")
+    spsp.save_npz(tmp_path / "csr.npz", csr_sp)
+
+    csr_mx = mx.nd.load(str(tmp_path / "csr.npz"))['']
+    assert np.sum(csr_mx.data.asnumpy() != csr_sp.data) == 0
+    assert np.sum(csr_mx.indices.asnumpy() != csr_sp.indices) == 0
+    assert np.sum(csr_mx.indptr.asnumpy() != csr_sp.indptr) == 0
+
+    csr_mx = save_fn(str(tmp_path / "csr_mx.npz"), csr_mx)
+    csr_mx_loaded = mx.nd.load(str(tmp_path / "csr_mx.npz"))
+    csr_mx_loaded = csr_mx_loaded[0] if save_fn is mx.nd.save else 
csr_mx_loaded['arr_0']
+    assert np.sum(csr_mx_loaded.data.asnumpy() != csr_sp.data) == 0
+    assert np.sum(csr_mx_loaded.indices.asnumpy() != csr_sp.indices) == 0
+    assert np.sum(csr_mx_loaded.indptr.asnumpy() != csr_sp.indptr) == 0
+
+
 def test_sparse_nd_unsupported():
     nd = mx.nd.zeros((2,2), stype='row_sparse')
     fn_slice = lambda x: x._slice(None, None)
@@ -629,24 +656,20 @@ def test_create_csr():
             assert_almost_equal(sp_csr.indices, sp.indices)
             assert(sp.dtype == sp_csr.dtype), (sp.dtype, sp_csr.dtype)
 
-        try:
-            import scipy.sparse as spsp
-            # random canonical csr
-            csr_sp = spsp.rand(shape[0], shape[1], density, format="csr")
-            csr_nd = f(csr_sp)
-            assert_csr_almost_equal(csr_nd, csr_sp)
-            # non-canonical csr which contains duplicates and unsorted indices
-            indptr = np.array([0, 2, 3, 7])
-            indices = np.array([0, 2, 2, 0, 1, 2, 1])
-            data = np.array([1, 2, 3, 4, 5, 6, 1])
-            non_canonical_csr = spsp.csr_matrix((data, indices, indptr), 
shape=(3, 3), dtype=csr_nd.dtype)
-            canonical_csr_nd = f(non_canonical_csr, dtype=csr_nd.dtype)
-            canonical_csr_sp = non_canonical_csr.copy()
-            canonical_csr_sp.sum_duplicates()
-            canonical_csr_sp.sort_indices()
-            assert_csr_almost_equal(canonical_csr_nd, canonical_csr_sp)
-        except ImportError:
-            print("Could not import scipy.sparse. Skipping unit tests for 
scipy csr creation")
+        # random canonical csr
+        csr_sp = spsp.rand(shape[0], shape[1], density, format="csr")
+        csr_nd = f(csr_sp)
+        assert_csr_almost_equal(csr_nd, csr_sp)
+        # non-canonical csr which contains duplicates and unsorted indices
+        indptr = np.array([0, 2, 3, 7])
+        indices = np.array([0, 2, 2, 0, 1, 2, 1])
+        data = np.array([1, 2, 3, 4, 5, 6, 1])
+        non_canonical_csr = spsp.csr_matrix((data, indices, indptr), shape=(3, 
3), dtype=csr_nd.dtype)
+        canonical_csr_nd = f(non_canonical_csr, dtype=csr_nd.dtype)
+        canonical_csr_sp = non_canonical_csr.copy()
+        canonical_csr_sp.sum_duplicates()
+        canonical_csr_sp.sort_indices()
+        assert_csr_almost_equal(canonical_csr_nd, canonical_csr_sp)
 
     dim0 = 20
     dim1 = 20
@@ -771,12 +794,8 @@ def test_create_sparse_nd_from_sparse():
     ones = mx.nd.ones(shape, dtype=src_dtype)
     csr_arrs = [ones.tostype('csr')]
     rsp_arrs = [ones.tostype('row_sparse')]
-    try:
-        import scipy.sparse as spsp
-        csr_sp = spsp.csr_matrix(np.ones(shape, dtype=src_dtype))
-        csr_arrs.append(csr_sp)
-    except ImportError:
-        print("Could not import scipy.sparse. Skipping unit tests for scipy 
csr creation")
+    csr_sp = spsp.csr_matrix(np.ones(shape, dtype=src_dtype))
+    csr_arrs.append(csr_sp)
     f_csr = mx.nd.sparse.csr_matrix
     f_rsp = mx.nd.sparse.row_sparse_array
     for sp_arr in csr_arrs:
diff --git a/tests/python/unittest/test_subgraph_op.py 
b/tests/python/unittest/test_subgraph_op.py
index c546fd3..11657ed 100644
--- a/tests/python/unittest/test_subgraph_op.py
+++ b/tests/python/unittest/test_subgraph_op.py
@@ -412,10 +412,10 @@ def test_subgraph_exe9(sym, subgraph_backend, op_names):
     assert len(outputs1) == len(outputs2)
     for i in range(len(outputs1)):
         assert_almost_equal((outputs1[i] - outputs2[i]).abs().sum().asnumpy(), 
np.zeros(shape=(1,)))
-        
+
 @pytest.mark.parametrize('subgraph_backend', ['default', 'default_v2'])
 @pytest.mark.parametrize('sym,op_names', get_graphs())
-def test_subgraph_backend_gluon(sym, subgraph_backend, op_names, tmpdir):
+def test_subgraph_backend_gluon(sym, subgraph_backend, op_names, tmp_path):
     """Call hybridize() to partition the graph, and then compare results of 
the partitioned
     sym and the original sym. Here do an inference before hybridizing with the 
subgraph_backend
     which means we'll pass shapes/types"""
@@ -428,16 +428,13 @@ def test_subgraph_backend_gluon(sym, subgraph_backend, 
op_names, tmpdir):
     sym_block.hybridize()
     outputs1 = sym_block(*x)
 
-    _, json_path = tempfile.mkstemp(suffix='-symbol.json', dir=str(tmpdir))
-    export_path = json_path.replace('-symbol.json', '')
-    params_path = export_path + '-0000.params'
-    sym_block.export(export_path)
+    sym_filename, params_filename = sym_block.export(str(tmp_path / 
'sym-block'))
 
     # load model and partition
-    sym_block = nn.SymbolBlock.imports(json_path,sym[1], params_path,
+    sym_block = nn.SymbolBlock.imports(sym_filename, sym[1], params_filename,
                                        ctx=mx.current_context())
     check_call(_LIB.MXSetSubgraphPropertyOpNamesV2(c_str(subgraph_backend), 
mx_uint(len(op_names)),
-                                                c_str_array(op_names)))
+                                                   c_str_array(op_names)))
     sym_block.optimize_for(*x, backend=subgraph_backend)
     outputs2 = sym_block(*x)
     check_call(_LIB.MXRemoveSubgraphPropertyOpNamesV2(c_str(subgraph_backend)))

Reply via email to