This is an automated email from the ASF dual-hosted git repository.

taolv pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 8b4a69a  [MKLDNN] Enable signed int8 support for convolution. (#13697)
8b4a69a is described below

commit 8b4a69aba8f9ed4afc0af7b6f673bfb0a1bebe36
Author: Zhennan Qin <[email protected]>
AuthorDate: Sun Feb 10 21:18:00 2019 +0800

    [MKLDNN] Enable signed int8 support for convolution. (#13697)
    
    * Enable s8s8 support for MKLDNN convolution.
    
    * Fix cpp build
    
    * Fix build.
    
    * Fix build
    
    * Remove openmp min/max reduction for windows build
    
    * Add mkldnn_OIhw4i16o4i_s8s8 support
    
    * Add all s8s8 weight format
    
    * Change ssd quantize script.
    
    * Update
    
    * Manually cast mshadow shape size to size_t
    
    * Fix merge.
    
    * Fix perl package.
    
    * Retrigger CI
    
    * Fix GPU test
    
    * Fix GPU test
    
    * Rerun CI
    
    * Rerun CI
    
    * Rerun CI
    
    * Rerun CI
    
    * Remove weight_channelwise_scale from params.
    
    * Fix
    
    * Keep API compatible.
    
    * Rerun CI
    
    * Rerun CI
    
    * Rerun CI
    
    * Rerun CI
    
    * Address comments.
    
    * fix.
    
    * Address debug build.
    
    * Add comment for next_impl
    
    * Rerun ci
    
    * Add new API MXExecutorSetMonitorCallbackEX
    
    * Add default value for monitor_all for cpp header.
    
    * Rerun CI
    
    * fix
    
    * script change for uint8.
    
    * trigger ci
    
    * trigger ci
---
 .gitignore                                         |   1 +
 cpp-package/include/mxnet-cpp/monitor.h            |   3 +-
 cpp-package/include/mxnet-cpp/monitor.hpp          |   8 +-
 example/quantization/imagenet_gen_qsym_mkldnn.py   |  45 ++--
 example/ssd/quantization.py                        |  25 +-
 include/mxnet/c_api.h                              |  10 +-
 include/mxnet/executor.h                           |   2 +-
 include/mxnet/tensor_blob.h                        |   2 +-
 perl-package/AI-MXNetCAPI/mxnet.i                  |   2 +-
 python/mxnet/contrib/quantization.py               |  43 ++--
 python/mxnet/executor.py                           |   9 +-
 python/mxnet/monitor.py                            |   9 +-
 src/c_api/c_api_executor.cc                        |  20 +-
 src/c_api/c_api_symbolic.cc                        |   4 +-
 src/executor/graph_executor.cc                     |  40 ++-
 src/executor/graph_executor.h                      |  10 +-
 src/operator/nn/mkldnn/mkldnn_base-inl.h           |  29 ++-
 src/operator/nn/mkldnn/mkldnn_base.cc              |  63 +++--
 src/operator/nn/mkldnn/mkldnn_convolution-inl.h    |  24 +-
 src/operator/nn/mkldnn/mkldnn_convolution.cc       | 186 +++++++-------
 .../quantization/mkldnn/mkldnn_dequantize-inl.h    |   4 +
 .../quantization/mkldnn/mkldnn_quantize_v2-inl.h   | 140 +++++++++++
 src/operator/quantization/quantization_utils.h     |  15 ++
 src/operator/quantization/quantize.cc              |   2 +-
 src/operator/quantization/quantize_graph_pass.cc   | 132 +++++-----
 src/operator/quantization/quantize_v2-inl.h        | 220 +++++++++++++++++
 .../quantization/{quantize.cc => quantize_v2.cc}   |  63 ++---
 src/operator/quantization/quantize_v2.cu           |  34 +++
 src/operator/quantization/requantize-inl.h         |  14 --
 src/operator/subgraph/mkldnn/mkldnn_conv.cc        | 268 +++++++++++----------
 .../subgraph/mkldnn/mkldnn_conv_property.cc        |  34 ++-
 tests/python/mkl/test_subgraph.py                  |  29 +--
 tests/python/unittest/test_operator.py             |  47 +++-
 33 files changed, 1019 insertions(+), 518 deletions(-)

diff --git a/.gitignore b/.gitignore
index d989888..705ef92 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,6 +65,7 @@ __pycache__
 *.d
 cmake-build*
 data
+model
 recommonmark
 
 # R
diff --git a/cpp-package/include/mxnet-cpp/monitor.h 
b/cpp-package/include/mxnet-cpp/monitor.h
index c1494d0..76e7ce8 100644
--- a/cpp-package/include/mxnet-cpp/monitor.h
+++ b/cpp-package/include/mxnet-cpp/monitor.h
@@ -70,8 +70,9 @@ class Monitor {
   /*!
   * \brief install callback to executor. Supports installing to multiple 
executors.
   * \param exe The executor to install to.
+  * \param monitor_all If true, monitor both input and output, otherwise 
monitor output only.
   */
-  void install(Executor *exe);
+  void install(Executor *exe, bool monitor_all = false);
 
   /*!
   * \brief Start collecting stats for current batch. Call before calling 
forward.
diff --git a/cpp-package/include/mxnet-cpp/monitor.hpp 
b/cpp-package/include/mxnet-cpp/monitor.hpp
index f3584e2..4439e1b 100644
--- a/cpp-package/include/mxnet-cpp/monitor.hpp
+++ b/cpp-package/include/mxnet-cpp/monitor.hpp
@@ -43,10 +43,10 @@ inline Monitor::Monitor(int interval, std::regex pattern, 
StatFunc stat_func)
   : interval(interval), pattern(pattern), stat_func(stat_func), step(0) {
 }
 
-inline void Monitor::install(Executor *exe) {
-  MXExecutorSetMonitorCallback(exe->handle_,
-      static_cast<ExecutorMonitorCallback>(&Monitor::executor_callback),
-      this);
+inline void Monitor::install(Executor *exe, bool monitor_all) {
+  MXExecutorSetMonitorCallbackEX(exe->handle_,
+                                 
static_cast<ExecutorMonitorCallback>(&Monitor::executor_callback),
+                                 this, monitor_all);
   exes.push_back(exe);
 }
 
diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py 
b/example/quantization/imagenet_gen_qsym_mkldnn.py
index 938890b..d807e7f 100644
--- a/example/quantization/imagenet_gen_qsym_mkldnn.py
+++ b/example/quantization/imagenet_gen_qsym_mkldnn.py
@@ -55,24 +55,24 @@ def convert_from_gluon(model_name, image_shape, 
classes=1000, logger=None):
     symnet = mx.symbol.load_json(y.tojson())
     params = net.collect_params()
     args = {}
-    auxs = {}    
+    auxs = {}
     for param in params.values():
         v = param._reduce()
         k = param.name
         if 'running' in k:
             auxs[k] = v
         else:
-            args[k] = v            
+            args[k] = v
     mod = mx.mod.Module(symbol=symnet, context=mx.cpu(),
                         label_names = ['softmax_label'])
-    mod.bind(for_training=False, 
-             data_shapes=[('data', (1,) + 
+    mod.bind(for_training=False,
+             data_shapes=[('data', (1,) +
                           tuple([int(i) for i in image_shape.split(',')]))])
     mod.set_params(arg_params=args, aux_params=auxs)
     dst_dir = os.path.join(dir_path, 'model')
     prefix = os.path.join(dir_path, 'model', model_name)
     if not os.path.isdir(dst_dir):
-        os.mkdir(dst_dir)       
+        os.mkdir(dst_dir)
     mod.save_checkpoint(prefix, 0)
     return prefix
 
@@ -104,7 +104,7 @@ if __name__ == '__main__':
                              'you can set to custom to load your pre-trained 
model.')
     parser.add_argument('--use-gluon-model', type=bool, default=False,
                         help='If enabled, will download pretrained model from 
Gluon-CV '
-                             'and convert to symbolic model ')    
+                             'and convert to symbolic model ')
     parser.add_argument('--batch-size', type=int, default=32)
     parser.add_argument('--label-name', type=str, default='softmax_label')
     parser.add_argument('--calib-dataset', type=str, 
default='data/val_256_q90.rec',
@@ -114,7 +114,7 @@ if __name__ == '__main__':
                         help='number of threads for data decoding')
     parser.add_argument('--num-calib-batches', type=int, default=10,
                         help='number of batches for calibration')
-    parser.add_argument('--exclude-first-conv', action='store_true', 
default=True,
+    parser.add_argument('--exclude-first-conv', action='store_true', 
default=False,
                         help='excluding quantizing the first conv layer since 
the'
                              ' input data may have negative value which 
doesn\'t support at moment' )
     parser.add_argument('--shuffle-dataset', action='store_true', default=True,
@@ -140,8 +140,8 @@ if __name__ == '__main__':
                              ' thresholds. This mode is expected to produce 
the best inference accuracy of all three'
                              ' kinds of quantized models if the calibration 
dataset is representative enough of the'
                              ' inference dataset.')
-    parser.add_argument('--quantized-dtype', type=str, default='uint8',
-                        choices=['int8', 'uint8'],
+    parser.add_argument('--quantized-dtype', type=str, default='auto',
+                        choices=['auto', 'int8', 'uint8'],
                         help='quantization destination data type for input 
data')
     parser.add_argument('--enable-calib-quantize', type=bool, default=True,
                         help='If enabled, the quantize op will '
@@ -198,40 +198,39 @@ if __name__ == '__main__':
     # get image shape
     image_shape = args.image_shape
 
+    calib_layer = lambda name: name.endswith('_output') or name == "data"
     exclude_first_conv = args.exclude_first_conv
+    if args.quantized_dtype == "uint8":
+        logger.info('quantized dtype is set to uint8, will exclude first 
conv.')
+        exclude_first_conv = True
     excluded_sym_names = []
     if args.model == 'imagenet1k-resnet-152':
         rgb_mean = '0,0,0'
         rgb_std = '1,1,1'
-        calib_layer = lambda name: name.endswith('_output')
-        excluded_sym_names += ['flatten0', 'fc1', 'pooling0']
+        excluded_sym_names += ['flatten0', 'fc1']
         if exclude_first_conv:
             excluded_sym_names += ['conv0']
     elif args.model == 'imagenet1k-inception-bn':
         rgb_mean = '123.68,116.779,103.939'
         rgb_std = '1,1,1'
-        calib_layer = lambda name: name.endswith('_output')
         excluded_sym_names += ['flatten', 'fc1']
         if exclude_first_conv:
             excluded_sym_names += ['conv_1']
     elif args.model in ['resnet50_v1', 'resnet101_v1']:
         rgb_mean = '123.68,116.779,103.939'
         rgb_std = '58.393, 57.12, 57.375'
-        calib_layer = lambda name: name.endswith('_output')
-        excluded_sym_names += ['resnetv10_dense0_fwd', 'resnetv10_pool0_fwd']
+        excluded_sym_names += ['resnetv10_dense0_fwd']
         if exclude_first_conv:
             excluded_sym_names += ['resnetv10_conv0_fwd']
     elif args.model == 'squeezenet1.0':
         rgb_mean = '123.68,116.779,103.939'
         rgb_std = '58.393, 57.12, 57.375'
-        calib_layer = lambda name: name.endswith('_output')
         excluded_sym_names += ['squeezenet0_flatten0_flatten0']
         if exclude_first_conv:
             excluded_sym_names += ['squeezenet0_conv0_fwd']
     elif args.model == 'mobilenet1.0':
         rgb_mean = '123.68,116.779,103.939'
         rgb_std = '58.393, 57.12, 57.375'
-        calib_layer = lambda name: name.endswith('_output')
         excluded_sym_names += ['mobilenet0_flatten0_flatten0',
                                'mobilenet0_dense0_fwd',
                                'mobilenet0_pool0_fwd']
@@ -240,22 +239,15 @@ if __name__ == '__main__':
     elif args.model == 'inceptionv3':
         rgb_mean = '123.68,116.779,103.939'
         rgb_std = '58.393, 57.12, 57.375'
-        calib_layer = lambda name: name.endswith('_output')
-        excluded_sym_names += ['inception30_dense0_fwd',
-                               'inception30_pool0_fwd']
+        excluded_sym_names += ['inception30_dense0_fwd']
         if exclude_first_conv:
             excluded_sym_names += ['inception30_conv0_fwd']
     elif args.model == 'custom':
         # add rgb mean/std of your model.
         rgb_mean = '0,0,0'
         rgb_std = '0,0,0'
-        calib_layer = lambda name: name.endswith('_output')
         # add layer names you donnot want to quantize.
-        # add conv/pool layer names that has negative inputs
-        # since Intel MKL-DNN only support uint8 quantization temporary.
-        # add all fc layer names since Intel MKL-DNN does not support 
temporary.
         excluded_sym_names += ['layers']
-        # add your first conv layer names since Intel MKL-DNN only support 
uint8 quantization temporary.
         if exclude_first_conv:
             excluded_sym_names += ['layers']
     else:
@@ -272,7 +264,7 @@ if __name__ == '__main__':
     mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': 
rgb_mean[2]}
     logger.info('rgb_std = %s' % rgb_std)
     rgb_std = [float(i) for i in rgb_std.split(',')]
-    std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]} 
   
+    std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]}
     combine_mean_std = {}
     combine_mean_std.update(mean_args)
     combine_mean_std.update(std_args)
@@ -303,8 +295,7 @@ if __name__ == '__main__':
                                                         calib_mode=calib_mode, 
calib_data=data,
                                                         
num_calib_examples=num_calib_batches * batch_size,
                                                         
calib_layer=calib_layer, quantized_dtype=args.quantized_dtype,
-                                                        
label_names=(label_name,), calib_quantize_op = True,
-                                                        logger=logger)
+                                                        
label_names=(label_name,), logger=logger)
         if calib_mode == 'entropy':
             suffix = '-quantized-%dbatches-entropy' % num_calib_batches
         elif calib_mode == 'naive':
diff --git a/example/ssd/quantization.py b/example/ssd/quantization.py
index 4e6e739..4b111df 100644
--- a/example/ssd/quantization.py
+++ b/example/ssd/quantization.py
@@ -51,7 +51,7 @@ if __name__ == '__main__':
     parser.add_argument('--batch-size', type=int, default=32)
     parser.add_argument('--num-calib-batches', type=int, default=5,
                         help='number of batches for calibration')
-    parser.add_argument('--exclude-first-conv', action='store_true', 
default=True,
+    parser.add_argument('--exclude-first-conv', action='store_true', 
default=False,
                         help='excluding quantizing the first conv layer since 
the'
                              ' number of channels is usually not a multiple of 
4 in that layer'
                              ' which does not satisfy the requirement of 
cuDNN')
@@ -78,8 +78,8 @@ if __name__ == '__main__':
                              ' thresholds. This mode is expected to produce 
the best inference accuracy of all three'
                              ' kinds of quantized models if the calibration 
dataset is representative enough of the'
                              ' inference dataset.')
-    parser.add_argument('--quantized-dtype', type=str, default='uint8',
-                        choices=['int8', 'uint8'],
+    parser.add_argument('--quantized-dtype', type=str, default='auto',
+                        choices=['auto', 'int8', 'uint8'],
                         help='quantization destination data type for input 
data')
 
     args = parser.parse_args()
@@ -115,18 +115,19 @@ if __name__ == '__main__':
     # get image shape
     image_shape = '3,300,300'
 
+    def calib_layer(name): return not (name.endswith('_data') or
+                                       name.endswith('_weight') or
+                                       name.endswith('_bias') or
+                                       name.endswith('_workspace'))
     # Quantization layer configs
     exclude_first_conv = args.exclude_first_conv
     excluded_sym_names = []
     rgb_mean = '123,117,104'
     for i in range(1,19):
         excluded_sym_names += ['flatten'+str(i)]
-    excluded_sym_names += ['relu4_3_cls_pred_conv',
-                            'relu7_cls_pred_conv',
-                            'relu4_3_loc_pred_conv',
-                            'multibox_loc_pred',
-                            'concat0',
-                            'concat1']
+    excluded_sym_names += ['multibox_loc_pred',
+                           'concat0',
+                           'concat1']
     if exclude_first_conv:
         excluded_sym_names += ['conv1_1']
 
@@ -158,10 +159,8 @@ if __name__ == '__main__':
                                                         ctx=ctx, 
excluded_sym_names=excluded_sym_names,
                                                         calib_mode=calib_mode, 
calib_data=eval_iter,
                                                         
num_calib_examples=num_calib_batches * batch_size,
-                                                        calib_layer=None, 
quantized_dtype=args.quantized_dtype,
-                                                        
label_names=(label_name,),
-                                                        calib_quantize_op=True,
-                                                        logger=logger)
+                                                        
calib_layer=calib_layer, quantized_dtype=args.quantized_dtype,
+                                                        
label_names=(label_name,), logger=logger)
         sym_name = '%s-symbol.json' % ('./model/cqssd_vgg16_reduced_300')
         param_name = '%s-%04d.params' % ('./model/cqssd_vgg16_reduced_300', 
epoch)
     qsym = qsym.get_backend_symbol('MKLDNN_POST_QUANTIZE')
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 79f2bf5..d6e13eb 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -1566,7 +1566,7 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym,
  * \param num_offline number of parameters that are quantized offline
  * \param offline_params array of c strings representing the names of params 
quantized offline
  * \param quantized_dtype the quantized destination type for input data.
- * \param calib_quantize whether calibrate quantize op with offline 
calibration data.
+ * \param calib_quantize **Deprecated**. quantize op will always be calibrated 
if could.
  */
 MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle, SymbolHandle 
*ret_sym_handle,
                                const mx_uint num_excluded_symbols,
@@ -1847,6 +1847,14 @@ MXNET_DLL int 
MXExecutorGetOptimizedSymbol(ExecutorHandle handle,
 MXNET_DLL int MXExecutorSetMonitorCallback(ExecutorHandle handle,
                                            ExecutorMonitorCallback callback,
                                            void* callback_handle);
+
+/*!
+ * \brief set a call back to notify the completion of operation
+ * \param monitor_all If true, monitor both input and output, otherwise 
monitor output only.
+ */
+MXNET_DLL int MXExecutorSetMonitorCallbackEX(ExecutorHandle handle,
+                                             ExecutorMonitorCallback callback,
+                                             void *callback_handle, bool 
monitor_all);
 //--------------------------------------------
 // Part 5: IO Interface
 //--------------------------------------------
diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h
index 0ab04b8..aec1009 100644
--- a/include/mxnet/executor.h
+++ b/include/mxnet/executor.h
@@ -174,7 +174,7 @@ class Executor {
   /*!
    * \brief Install a callback to notify the completion of operation.
    */
-  virtual void SetMonitorCallback(const MonitorCallback& callback) {}
+  virtual void SetMonitorCallback(const MonitorCallback& callback, bool 
monitor_all = false) {}
 };  // class executor
 }  // namespace mxnet
 #endif  // MXNET_EXECUTOR_H_
diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h
index 496e8c7..412877a 100755
--- a/include/mxnet/tensor_blob.h
+++ b/include/mxnet/tensor_blob.h
@@ -287,7 +287,7 @@ class TBlob {
     CHECK(Device::kDevMask == this->dev_mask())
       << "TBlob.get: device type do not match specified type";
     CHECK_EQ(this->CheckContiguous(), true) << "TBlob.get_reshape: must be 
contiguous";
-    CHECK_EQ(this->shape_.Size(), shape.Size())
+    CHECK_EQ(this->shape_.Size(), static_cast<size_t>(shape.Size()))
       << "TBlob.get_with_shape: new and old shape do not match total elements";
     return mshadow::Tensor<Device, dim, DType>(dptr<DType>(), shape,
                                                shape[dim - 1], stream);
diff --git a/perl-package/AI-MXNetCAPI/mxnet.i 
b/perl-package/AI-MXNetCAPI/mxnet.i
index b1907f5..0e6a05e 100644
--- a/perl-package/AI-MXNetCAPI/mxnet.i
+++ b/perl-package/AI-MXNetCAPI/mxnet.i
@@ -1618,6 +1618,7 @@ int MXExecutorReshape(int partial_shaping,
 int MXExecutorSetMonitorCallback(ExecutorHandle handle,
                                            ExecutorMonitorCallback callback,
                                            void* callback_handle);
+
 //--------------------------------------------
 // Part 5: IO Interface
 //--------------------------------------------
@@ -2167,4 +2168,3 @@ int MXRtcCudaKernelCall(CudaKernelHandle handle, int 
dev_id, void** cuda_kernel_
                                   mx_uint grid_dim_z, mx_uint block_dim_x,
                                   mx_uint block_dim_y, mx_uint block_dim_z,
                                   mx_uint shared_mem);
-
diff --git a/python/mxnet/contrib/quantization.py 
b/python/mxnet/contrib/quantization.py
index 61ad8a3..96183bb 100644
--- a/python/mxnet/contrib/quantization.py
+++ b/python/mxnet/contrib/quantization.py
@@ -80,8 +80,7 @@ def _quantize_params(qsym, params, th_dict):
                 quantized_params[name] = ndarray.array([th_dict[output][1]])
     return quantized_params
 
-def _quantize_symbol(sym, excluded_symbols=None, offline_params=None,
-                     quantized_dtype='int8', calib_quantize_op=False):
+def _quantize_symbol(sym, excluded_symbols=None, offline_params=None, 
quantized_dtype='int8'):
     """Given a symbol object representing a neural network of data type FP32,
     quantize it into a INT8 network.
 
@@ -98,8 +97,6 @@ def _quantize_symbol(sym, excluded_symbols=None, 
offline_params=None,
         avoided.
     quantized_dtype: str
         The quantized destination type for input data.
-    calib_quantize_op : bool
-        Whether perform offline calibration for quantize op.
     """
     num_excluded_symbols = 0
     if excluded_symbols is not None:
@@ -123,7 +120,7 @@ def _quantize_symbol(sym, excluded_symbols=None, 
offline_params=None,
                                      mx_uint(num_offline),
                                      c_array(ctypes.c_char_p, offline),
                                      c_str(quantized_dtype),
-                                     ctypes.c_bool(calib_quantize_op)))
+                                     ctypes.c_bool(True)))
     return Symbol(out)
 
 
@@ -151,7 +148,6 @@ class _LayerOutputCollector(object):
         else:
             self.nd_dict[name] = [arr]
 
-
 class _LayerOutputMinMaxCollector(object):
     """Saves layer output min and max values in a dict with layer names as 
keys.
     The collected min and max values will be directly used as thresholds for 
quantization.
@@ -177,10 +173,9 @@ class _LayerOutputMinMaxCollector(object):
         else:
             self.min_max_dict[name] = (min_range, max_range)
         if self.logger is not None:
-            self.logger.info("Collecting layer %s output min_range=%f, 
max_range=%f"
+            self.logger.info("Collecting layer %s min_range=%f, max_range=%f"
                              % (name, min_range, max_range))
 
-
 def _calibrate_quantized_sym(qsym, th_dict):
     """Given a dictionary containing the thresholds for quantizing the layers,
     set the thresholds into the quantized symbol as the params of requantize 
operators.
@@ -210,7 +205,7 @@ def _collect_layer_statistics(mod, data, collector, 
max_num_examples=None, logge
     if not isinstance(data, DataIter):
         raise ValueError('Only supports data as a type of DataIter, while 
received type %s'
                          % str(type(data)))
-    mod._exec_group.execs[0].set_monitor_callback(collector.collect)
+    mod._exec_group.execs[0].set_monitor_callback(collector.collect, 
monitor_all=True)
     num_batches = 0
     num_examples = 0
     for batch in data:
@@ -265,6 +260,9 @@ def _smooth_distribution(p, eps=0.0001):
 # pylint: disable=line-too-long
 def _get_optimal_threshold(arr, num_bins=8001, num_quantized_bins=255):
     """Given a dataset, find the optimal threshold for quantizing it.
+    The reference distribution is `q`, and the candidate distribution is `p`.
+    `q` is a truncated version of the original distribution.
+
     Ref: 
http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
     """
     if isinstance(arr, NDArray):
@@ -290,8 +288,6 @@ def _get_optimal_threshold(arr, num_bins=8001, 
num_quantized_bins=255):
     hist, hist_edges = np.histogram(arr, bins=num_bins, range=(-th, th))
     zero_bin_idx = num_bins // 2
     num_half_quantized_bins = num_quantized_bins // 2
-    assert np.allclose(hist_edges[zero_bin_idx] + hist_edges[zero_bin_idx + 1],
-                       0, rtol=1e-5, atol=1e-7)
 
     thresholds = np.zeros(num_bins // 2 + 1 - num_quantized_bins // 2)
     divergence = np.zeros_like(thresholds)
@@ -315,10 +311,10 @@ def _get_optimal_threshold(arr, num_bins=8001, 
num_quantized_bins=255):
         right_outlier_count = np.sum(hist[p_bin_idx_stop:])
         p[-1] += right_outlier_count
         # is_nonzeros[k] indicates whether hist[k] is nonzero
-        is_nonzeros = (sliced_nd_hist != 0).astype(np.int32)
+        is_nonzeros = (p != 0).astype(np.int32)
 
         # calculate how many bins should be merged to generate quantized 
distribution q
-        num_merged_bins = p.size // num_quantized_bins
+        num_merged_bins = sliced_nd_hist.size // num_quantized_bins
         # merge hist into num_quantized_bins bins
         for j in range(num_quantized_bins):
             start = j * num_merged_bins
@@ -326,17 +322,17 @@ def _get_optimal_threshold(arr, num_bins=8001, 
num_quantized_bins=255):
             quantized_bins[j] = sliced_nd_hist[start:stop].sum()
         quantized_bins[-1] += sliced_nd_hist[num_quantized_bins * 
num_merged_bins:].sum()
         # expand quantized_bins into p.size bins
-        q = np.zeros(p.size, dtype=np.float32)
+        q = np.zeros(sliced_nd_hist.size, dtype=np.float32)
         for j in range(num_quantized_bins):
             start = j * num_merged_bins
             if j == num_quantized_bins - 1:
-                stop = -1
+                stop = len(is_nonzeros)
             else:
                 stop = start + num_merged_bins
             norm = is_nonzeros[start:stop].sum()
             if norm != 0:
                 q[start:stop] = float(quantized_bins[j]) / float(norm)
-        q[sliced_nd_hist == 0] = 0
+        q[p == 0] = 0
         p = _smooth_distribution(p)
         # There is a chance that q is an invalid probability distribution.
         try:
@@ -344,7 +340,6 @@ def _get_optimal_threshold(arr, num_bins=8001, 
num_quantized_bins=255):
         except ValueError:
             divergence[i - num_half_quantized_bins] = float("inf")
         divergence[i - num_half_quantized_bins] = stats.entropy(p, q)
-        quantized_bins[:] = 0
 
     min_divergence_idx = np.argmin(divergence)
     min_divergence = divergence[min_divergence_idx]
@@ -424,7 +419,7 @@ def quantize_model(sym, arg_params, aux_params,
                    data_names=('data',), label_names=('softmax_label',),
                    ctx=cpu(), excluded_sym_names=None, calib_mode='entropy',
                    calib_data=None, num_calib_examples=None, calib_layer=None,
-                   quantized_dtype='int8', calib_quantize_op=False, 
logger=logging):
+                   quantized_dtype='int8', logger=logging):
     """User-level API for generating a quantized model from a FP32 model w/ or 
w/o calibration.
     The backend quantized operators are only enabled for Linux systems. Please 
do not run
     inference using the quantized models on Windows for now.
@@ -476,9 +471,8 @@ def quantize_model(sym, arg_params, aux_params,
         all the layers' outputs that need requantization will be collected.
     quantized_dtype : str
         The quantized destination type for input data. Currently support 'int8'
-        and 'uint8', default value is 'int8'.
-    calib_quantize_op: bool
-        Whether calibrate quantize op with its input calibration data. The 
quantize op's input should be in calib_layer
+        , 'uint8' and 'auto'. 'auto' means automatically select output type 
according to calibration result.
+        Default value is 'int8'.
     logger : Object
         A logging object for printing information during the process of 
quantization.
 
@@ -496,13 +490,12 @@ def quantize_model(sym, arg_params, aux_params,
                          ' while received type %s' % 
str(type(excluded_sym_names)))
 
     logger.info('Quantizing symbol')
-    if quantized_dtype not in ('int8', 'uint8'):
+    if quantized_dtype not in ('int8', 'uint8', 'auto'):
         raise ValueError('unknown quantized_dtype %s received,'
-                         ' expected `int8` or `uint8`' % quantized_dtype)
+                         ' expected `int8`, `uint8` or `auto`' % 
quantized_dtype)
     qsym = _quantize_symbol(sym, excluded_symbols=excluded_sym_names,
                             offline_params=list(arg_params.keys()),
-                            quantized_dtype=quantized_dtype,
-                            calib_quantize_op=calib_quantize_op)
+                            quantized_dtype=quantized_dtype)
 
     th_dict = {}
     if calib_mode is not None and calib_mode != 'none':
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index fcd5406..7bf8675 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -234,13 +234,15 @@ class Executor(object):
             ndarray,
             ctypes.c_int(is_train)))
 
-    def set_monitor_callback(self, callback):
+    def set_monitor_callback(self, callback, monitor_all=False):
         """Install callback for monitor.
 
         Parameters
         ----------
         callback : function
             Takes a string and an NDArrayHandle.
+        monitor_all : bool, default False
+            If true, monitor both input and output, otherwise monitor output 
only.
 
         Examples
         --------
@@ -251,10 +253,11 @@ class Executor(object):
         """
         cb_type = ctypes.CFUNCTYPE(None, ctypes.c_char_p, NDArrayHandle, 
ctypes.c_void_p)
         self._monitor_callback = cb_type(_monitor_callback_wrapper(callback))
-        check_call(_LIB.MXExecutorSetMonitorCallback(
+        check_call(_LIB.MXExecutorSetMonitorCallbackEX(
             self.handle,
             self._monitor_callback,
-            None))
+            None,
+            ctypes.c_int(monitor_all)))
 
     @property
     def arg_dict(self):
diff --git a/python/mxnet/monitor.py b/python/mxnet/monitor.py
index e3185a1..2e10708 100644
--- a/python/mxnet/monitor.py
+++ b/python/mxnet/monitor.py
@@ -31,7 +31,7 @@ from . import ndarray
 
 
 class Monitor(object):
-    """Monitor outputs, weights, and gradients for debugging.
+    """Monitor inputs, outputs, weights, and gradients for debugging.
 
     Parameters
     ----------
@@ -46,8 +46,10 @@ class Monitor(object):
         Only tensors with names that match `name_pattern` will be included.
         For example, '.*weight|.*output' will print all weights and outputs and
         '.*backward.*' will print all gradients.
+    monitor_all : bool, default False
+        If true, monitor both input and output, otherwise monitor output only.
     """
-    def __init__(self, interval, stat_func=None, pattern='.*', sort=False):
+    def __init__(self, interval, stat_func=None, pattern='.*', sort=False, 
monitor_all=False):
         if stat_func is None:
             def asum_stat(x):
                 """returns |x|/size(x), async execution."""
@@ -61,6 +63,7 @@ class Monitor(object):
         self.exes = []
         self.re_prog = re.compile(pattern)
         self.sort = sort
+        self.monitor_all = monitor_all
         def stat_helper(name, array):
             """wrapper for executor callback"""
             array = ctypes.cast(array, NDArrayHandle)
@@ -79,7 +82,7 @@ class Monitor(object):
         exe : mx.executor.Executor
             The Executor (returned by symbol.bind) to install to.
         """
-        exe.set_monitor_callback(self.stat_helper)
+        exe.set_monitor_callback(self.stat_helper, self.monitor_all)
         self.exes.append(exe)
 
     def tic(self):
diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc
index e2e53c7..66566ed 100644
--- a/src/c_api/c_api_executor.cc
+++ b/src/c_api/c_api_executor.cc
@@ -645,8 +645,6 @@ int MXExecutorGetOptimizedSymbol(ExecutorHandle handle,
   API_END_HANDLE_ERROR(delete s);
 }
 
-
-
 int MXExecutorSetMonitorCallback(ExecutorHandle handle,
                                  ExecutorMonitorCallback callback,
                                  void* callback_handle) {
@@ -658,6 +656,22 @@ int MXExecutorSetMonitorCallback(ExecutorHandle handle,
     callback_temp(name, handle, callback_handle_temp);
   };
   Executor *exec = static_cast<Executor*>(handle);
-  exec->SetMonitorCallback(clbk);
+  exec->SetMonitorCallback(clbk, false);
+  API_END();
+}
+
+int MXExecutorSetMonitorCallbackEX(ExecutorHandle handle,
+                                   ExecutorMonitorCallback callback,
+                                   void* callback_handle,
+                                   bool monitor_all) {
+  API_BEGIN();
+  ExecutorMonitorCallback callback_temp = callback;
+  void* callback_handle_temp = callback_handle;
+  std::function<void(const char*, void*)> clbk
+  = [callback_temp, callback_handle_temp](const char *name, void* handle) {
+    callback_temp(name, handle, callback_handle_temp);
+  };
+  Executor *exec = static_cast<Executor*>(handle);
+  exec->SetMonitorCallback(clbk, monitor_all);
   API_END();
 }
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index 8517c9c..32b63c1 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -668,7 +668,6 @@ int MXQuantizeSymbol(SymbolHandle sym_handle,
   g.attrs["excluded_nodes"] = 
std::make_shared<nnvm::any>(std::move(excluded_node_names));
   g.attrs["offline_params"] = std::make_shared<nnvm::any>(std::move(offline));
   g.attrs["quantized_dtype"] = 
std::make_shared<nnvm::any>(std::move(quantized_type));
-  g.attrs["calib_quantize"] = std::make_shared<nnvm::any>(calib_quantize);
   g = ApplyPass(std::move(g), "QuantizeGraph");
   s->outputs = g.outputs;
   *ret_sym_handle = s;
@@ -685,10 +684,9 @@ int MXSetCalibTableToQuantizedSymbol(SymbolHandle 
qsym_handle,
   API_BEGIN();
   nnvm::Symbol* sym = static_cast<nnvm::Symbol*>(qsym_handle);
   nnvm::Graph g = Symbol2Graph(*sym);
-  const std::string prefix = "quantized_";
   std::unordered_map<std::string, std::pair<float, float>> calib_table;
   for (size_t i = 0; i < num_layers; ++i) {
-    calib_table.emplace(prefix+layer_names[i], std::make_pair(min_ranges[i], 
max_ranges[i]));
+    calib_table.emplace(layer_names[i], std::make_pair(min_ranges[i], 
max_ranges[i]));
   }
   g.attrs["calib_table"] = std::make_shared<nnvm::any>(std::move(calib_table));
   g = ApplyPass(std::move(g), "SetCalibTableToQuantizedGraph");
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index d866ad1..8302dc1 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -101,9 +101,10 @@ void GraphExecutor::Print(std::ostream &os) const {  // 
NOLINT(*)
   os << "Total " << 11 << " TempSpace resource requested\n";
 }
 
-void GraphExecutor::SetMonitorCallback(const MonitorCallback& callback) {
+void GraphExecutor::SetMonitorCallback(const MonitorCallback& callback, bool 
monitor_all) {
   CHECK(callback) << "invalid callback";
   monitor_callback_ = callback;
+  monitor_all_ = monitor_all;
 }
 
 const std::vector<NDArray>& GraphExecutor::outputs() const {
@@ -1291,7 +1292,36 @@ void GraphExecutor::BulkInferenceOpSegs() {
   }
 }
 
-void GraphExecutor::ExecuteMonCallback(size_t nid) {
+void GraphExecutor::ExecuteMonInputCallback(size_t nid) {
+  static const auto& flist_inputs =
+      nnvm::Op::GetAttr<nnvm::FListInputNames>("FListInputNames");
+  const auto& idx = graph_.indexed_graph();
+  std::vector<std::string> input_names;
+  OpNode& opnode = op_nodes_[nid];
+  const auto& inode = idx[nid];
+  const auto& node = idx[nid].source;
+  if (flist_inputs.count(node->op())) {
+    input_names = flist_inputs[node->op()](node->attrs);
+  } else {
+    for (size_t i = 0; i < node->num_inputs(); ++i) {
+      input_names.emplace_back("input" + std::to_string(i));
+    }
+  }
+  CHECK_EQ(opnode.exec->in_array.size(), input_names.size());
+  for (size_t i = 0; i < opnode.exec->in_array.size(); ++i) {
+    if (node->inputs[i].node->is_variable()) {
+    // Monitor variable
+    NDArray *cpy = new NDArray(opnode.exec->in_array[i]);
+    std::string name = node->inputs[i].node->attrs.name;
+    this->monitor_callback_(name.c_str(), reinterpret_cast<void*>(cpy));
+    }
+    NDArray *cpy = new NDArray(opnode.exec->in_array[i]);
+    std::string name = inode.source->attrs.name + "_" + input_names[i];
+    this->monitor_callback_(name.c_str(), reinterpret_cast<void*>(cpy));
+  }
+}
+
+void GraphExecutor::ExecuteMonOutputCallback(size_t nid) {
   static const auto& flist_outputs =
       nnvm::Op::GetAttr<nnvm::FListOutputNames>("FListOutputNames");
   const auto& idx = graph_.indexed_graph();
@@ -1341,6 +1371,10 @@ void GraphExecutor::RunOps(bool is_train, size_t 
topo_start, size_t topo_end) {
     if (inode.source->is_variable()) continue;
     OpNode& opnode = op_nodes_[nid];
     if (op_nodes_[nid].skip_exec_node) continue;
+    // Monitor callbacks
+    if (monitor_callback_ && monitor_all_) {
+      ExecuteMonInputCallback(nid);
+    }
     opnode.exec->op_ctx.is_train = is_train;
     opnode.exec->op_ctx.need_grad = need_grad_;
     if (opnode.exec->exec_type() == ExecType::kCrossDeviceCopy) {
@@ -1359,7 +1393,7 @@ void GraphExecutor::RunOps(bool is_train, size_t 
topo_start, size_t topo_end) {
     }
     // Monitor callbacks
     if (monitor_callback_) {
-      ExecuteMonCallback(nid);
+      ExecuteMonOutputCallback(nid);
     }
   }
 }
diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h
index f5f032e..c899a6f 100644
--- a/src/executor/graph_executor.h
+++ b/src/executor/graph_executor.h
@@ -68,7 +68,7 @@ class GraphExecutor : public Executor {
   const std::unordered_map<std::string, NDArray>& arg_grad_map() const 
override;
   const std::unordered_map<std::string, NDArray>& aux_state_map() const 
override;
   void Print(std::ostream &os) const override; // NOLINT(*)
-  void SetMonitorCallback(const MonitorCallback& callback) override;
+  void SetMonitorCallback(const MonitorCallback& callback, bool monitor_all = 
false) override;
   // Initialize the rest of attributes
   // after setting up arguments.
   void FinishInitGraph(nnvm::Symbol symbol, nnvm::Graph g,
@@ -209,8 +209,10 @@ class GraphExecutor : public Executor {
    *  ret.opr Can be nullptr if creation failed.
   */
   CachedSegOpr CreateCachedSegOpr(size_t topo_start, size_t topo_end);
-  // run the monitor callback for node `nid`
-  void ExecuteMonCallback(size_t nid);
+  // run the monitor callback for input of node `nid`
+  void ExecuteMonInputCallback(size_t nid);
+  // run the monitor callback for output of node `nid`
+  void ExecuteMonOutputCallback(size_t nid);
   // peform bulking and segmentation on an inference graph
   void BulkInferenceOpSegs();
   // perform bulking and segmentation on a training graph
@@ -250,6 +252,8 @@ class GraphExecutor : public Executor {
   size_t num_forward_nodes_{0};
   // monitor call back
   std::function<void(const char*, void*)> monitor_callback_{nullptr};
+  // monitor both input and output from monitor call back
+  bool monitor_all_{false};
   // whether to enable bulk execution
   bool prefer_bulk_execution_;
   // cached segment operator
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h 
b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 18ef3f3..f770c4a 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -190,6 +190,9 @@ static int GetTypeSize(int dtype) {
 }
 
 static inline size_t GetArraySize(const NDArray &arr) {
+  if (arr.IsMKLDNNData()) {
+    return arr.GetMKLDNNData()->get_primitive_desc().get_size();
+  }
   return arr.shape().Size() * GetTypeSize(arr.dtype());
 }
 
@@ -238,26 +241,25 @@ static inline size_t GetMemDescSize(const 
mkldnn::memory::desc &md) {
   return ret;
 }
 
-inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) {
+inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int dtype = 
-1) {
+  int ndim = arr.shape().ndim();
   mkldnn::memory::dims dims(ndim);
+  dtype = (dtype == -1) ? arr.dtype() : dtype;
   for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
-  return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()),
-                              mkldnn::memory::format::any};
-}
-
-inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) {
-  return GetMemDesc(arr, arr.shape().ndim());
+  return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), 
mkldnn::memory::format::any};
 }
 
 inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
-                                                 int num_groups) {
-  auto ndim = arr.shape().ndim();
-  mkldnn::memory::dims tz = mkldnn::memory::dims{0};
+                                                 int num_groups,
+                                                 bool quantized = false) {
+  int dtype = quantized ? mshadow::kInt8 : arr.dtype();
   if (num_groups == 1) {
-    return GetMemDesc(arr);
+    return GetMemDesc(arr, dtype);
   } else {
+    auto ndim = arr.shape().ndim();
     CHECK((ndim == 3) || (ndim == 4))
         << "MKL-DNN weight currectly supports 3d and 4d layout";
+    auto tz = mkldnn::memory::dims{0};
     const int N = 0, H = 2, W = 3, C = 1;
     if (ndim == 3) {
       tz = mkldnn::memory::dims{
@@ -269,8 +271,7 @@ inline static mkldnn::memory::desc GetWeightDesc(const 
NDArray &arr,
           static_cast<int>(arr.shape()[C]), static_cast<int>(arr.shape()[H]),
           static_cast<int>(arr.shape()[W])};
     }
-    return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()),
-                                mkldnn::memory::format::any};
+    return mkldnn::memory::desc{tz, get_mkldnn_type(dtype), 
mkldnn::memory::format::any};
   }
 }
 
@@ -447,6 +448,8 @@ static inline void CreateDefaultInputs(const 
std::vector<NDArray> &arrs,
   }
 }
 
+const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups);
+
 const mkldnn::memory *GetWeights(const NDArray &arr,
                                  const mkldnn::memory::primitive_desc 
&target_pd,
                                  int num_groups);
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc 
b/src/operator/nn/mkldnn/mkldnn_base.cc
index ccb9d7e..d40c406 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -229,61 +229,49 @@ void CommitOutput(const NDArray &arr, const 
mkldnn_output_t &res) {
   }
 }
 
-const mkldnn::memory *GetWeights(const NDArray &arr,
-                                 const mkldnn::memory::primitive_desc 
&target_pd,
-                                 int num_groups) {
-  const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd);
-  // If the weight array already uses the target layout, simply return it
-  // directly.
-  if (mem)
-    return mem;
-
-  mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype());
-  mkldnn::memory::dims tz = mkldnn::memory::dims{0};
-  mkldnn::memory::format format = mkldnn::memory::format::format_undef;
+const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
+  const auto type = get_mkldnn_type(arr.dtype());
+  auto tz = mkldnn::memory::dims{0};
+  auto format = mkldnn::memory::format::format_undef;
   auto engine = CpuEngine::Get()->get_engine();
   const int O = 0, I = 1, H = 2, W = 3;
   if (arr.shape().ndim() == 2) {
-    tz = mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
-                              static_cast<int>(arr.shape()[I])};
+    tz = mkldnn::memory::dims{static_cast<int>(arr.shape()[O]), 
static_cast<int>(arr.shape()[I])};
     format = mkldnn::memory::format::oi;
   } else if (arr.shape().ndim() == 3) {
     tz = num_groups > 1
-             ? mkldnn::memory::dims{num_groups,
-                                    static_cast<int>(arr.shape()[O] /
-                                                     num_groups),
+             ? mkldnn::memory::dims{num_groups, 
static_cast<int>(arr.shape()[O] / num_groups),
                                     static_cast<int>(arr.shape()[I]),
                                     static_cast<int>(arr.shape()[H])}
              : mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
                                     static_cast<int>(arr.shape()[I]),
                                     static_cast<int>(arr.shape()[H])};
-    format = num_groups > 1 ? mkldnn::memory::format::goiw
-                            : mkldnn::memory::format::oiw;
+    format = num_groups > 1 ? mkldnn::memory::format::goiw : 
mkldnn::memory::format::oiw;
   } else if (arr.shape().ndim() == 4) {
     tz = num_groups > 1
-             ? mkldnn::memory::dims{num_groups,
-                                    static_cast<int>(arr.shape()[O] /
-                                                     num_groups),
+             ? mkldnn::memory::dims{num_groups, 
static_cast<int>(arr.shape()[O] / num_groups),
                                     static_cast<int>(arr.shape()[I]),
                                     static_cast<int>(arr.shape()[H]),
                                     static_cast<int>(arr.shape()[W])}
-             : mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
-                                    static_cast<int>(arr.shape()[I]),
-                                    static_cast<int>(arr.shape()[H]),
-                                    static_cast<int>(arr.shape()[W])};
-    format = num_groups > 1 ? mkldnn::memory::format::goihw
-                            : mkldnn::memory::format::oihw;
+             : mkldnn::memory::dims{
+                   static_cast<int>(arr.shape()[O]), 
static_cast<int>(arr.shape()[I]),
+                   static_cast<int>(arr.shape()[H]), 
static_cast<int>(arr.shape()[W])};
+    format = num_groups > 1 ? mkldnn::memory::format::goihw : 
mkldnn::memory::format::oihw;
   } else {
     LOG(FATAL) << "The weight array has an unsupported number of dimensions";
-    return nullptr;
   }
-  mkldnn::memory::desc md =
-      mkldnn::memory::desc{tz, type, format};
-  mkldnn::memory::primitive_desc pd =
-      mkldnn::memory::primitive_desc{md, engine};
-  mem = arr.GetMKLDNNData(pd);
-  if (mem == nullptr)
-    mem = arr.GetMKLDNNDataReorder(target_pd);
+  const auto md = mkldnn::memory::desc{tz, type, format};
+  const auto pd = mkldnn::memory::primitive_desc{md, engine};
+  return arr.GetMKLDNNData(pd);
+}
+
+const mkldnn::memory *GetWeights(const NDArray &arr,
+                                 const mkldnn::memory::primitive_desc 
&target_pd, int num_groups) {
+  const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd);
+  // If the weight array already uses the target layout, simply return it 
directly.
+  if (mem) return mem;
+  mem = GetWeights(arr, num_groups);
+  if (mem == nullptr) mem = arr.GetMKLDNNDataReorder(target_pd);
   if (mem->get_primitive_desc() == target_pd) return mem;
 
   auto ret = TmpMemMgr::Get()->Alloc(target_pd);
@@ -350,6 +338,7 @@ mkldnn_memory_format_t GetDefaultFormat(const 
mkldnn::memory::desc &desc) {
       case mkldnn_oIhw8i:
       case mkldnn_oIhw16i:
       case mkldnn_OIhw8i8o:
+      case mkldnn_hwio_s8s8:
       case mkldnn_OIhw16i16o:
       case mkldnn_OIhw4i16o4i:
       case mkldnn_OIhw4i16o4i_s8s8:
@@ -384,9 +373,11 @@ mkldnn_memory_format_t GetDefaultFormat(const 
mkldnn::memory::desc &desc) {
     switch (desc.data.format) {
       case mkldnn_goihw:
       case mkldnn_hwigo:
+      case mkldnn_hwigo_s8s8:
       case mkldnn_gOIhw8i8o:
       case mkldnn_gOIhw16i16o:
       case mkldnn_gOIhw4i16o4i:
+      case mkldnn_gOIhw4i16o4i_s8s8:
       case mkldnn_gOIhw8i16o2i:
       case mkldnn_gOIhw8o16i2o:
       case mkldnn_gOIhw8o8i:
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h 
b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
index 971c66a..ab6650e 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
@@ -42,7 +42,6 @@ struct MKLDNNConvParam : public 
dmlc::Parameter<MKLDNNConvParam> {
   bool with_sum;
   bool with_postsum_relu;
   bool quantized;
-  bool weight_channelwise_scale;
 
   dmlc::optional<float> min_calib_range;  // min float value calculated from 
calibration dataset
   dmlc::optional<float> max_calib_range;  // max float value calculated from 
calibration dataset
@@ -58,8 +57,6 @@ struct MKLDNNConvParam : public 
dmlc::Parameter<MKLDNNConvParam> {
     .describe("Add post relu after sum");
     DMLC_DECLARE_FIELD(quantized).set_default(false)
     .describe("enable quantization");
-    DMLC_DECLARE_FIELD(weight_channelwise_scale).set_default(true)
-    .describe("Quantize weight with channel wise scales.");
     DMLC_DECLARE_FIELD(min_calib_range)
     .set_default(dmlc::optional<float>())
     .describe("The minimum scalar value in the form of float32 obtained "
@@ -85,23 +82,28 @@ static inline bool IsOutputUInt8(const MKLDNNConvParam 
&mkldnn_param) {
          mkldnn_param.with_postsum_relu;
 }
 
-mkldnn::convolution_forward::primitive_desc
-GetConvFwdImpl(const MKLDNNConvFullParam &param, const bool is_train,
-               const NDArray &data, const NDArray &weights, const NDArray 
*bias,
-               const NDArray &output);
+mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(const 
MKLDNNConvFullParam &param,
+                                                           const bool is_train,
+                                                           const NDArray &data,
+                                                           const NDArray 
&weights,
+                                                           const NDArray *bias,
+                                                           const NDArray 
&output);
 
 class MKLDNNConvForward {
  public:
   mkldnn::convolution_forward::primitive_desc fwd_pd;
 
-  MKLDNNConvForward(const MKLDNNConvFullParam &param, const bool is_train,
-                    const NDArray &data, const NDArray &weights,
-                    const NDArray *bias, const NDArray &output)
-      : fwd_pd(GetConvFwdImpl(param, is_train, data, weights, bias, output)) {}
+  MKLDNNConvForward(const MKLDNNConvFullParam &param, const bool is_train, 
const NDArray &data,
+                    const NDArray &weights, const NDArray *bias, const NDArray 
&output);
 
   void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight,
                  const mkldnn::memory *bias, const mkldnn::memory &output);
 
+  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) {
+    this->data_->set_data_handle(data.get_data_handle());
+    this->out_->set_data_handle(output.get_data_handle());
+  }
+
   const mkldnn::convolution_forward &GetFwd() const {
     return *fwd_;
   }
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc 
b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index 7f423ce..a3aca98 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -45,15 +45,21 @@ bool SupportMKLDNNConv(const ConvolutionParam& params, 
const NDArray &input) {
           (input.shape().ndim() == 4));
 }
 
-mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
-    const MKLDNNConvFullParam &param, const bool is_train,
-    const NDArray &data, const NDArray &weights, const NDArray *bias,
-    const NDArray &output) {
+mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(const 
MKLDNNConvFullParam &param,
+                                                           const bool 
is_train, const NDArray &data,
+                                                           const NDArray 
&weights,
+                                                           const NDArray *bias,
+                                                           const NDArray 
&output) {
   auto prop = is_train ? mkldnn::prop_kind::forward_training : 
mkldnn::prop_kind::forward_scoring;
   auto data_md = GetMemDesc(data);
-  auto weight_md = GetWeightDesc(weights, param.conv_param.num_group);
+  auto weight_md = GetWeightDesc(weights, param.conv_param.num_group, 
param.mkldnn_param.quantized);
   auto out_md = GetMemDesc(output);
-  auto engine = CpuEngine::Get()->get_engine();
+  auto bias_md =
+      bias ? (param.mkldnn_param.quantized ? GetMemDesc(*bias, 
mshadow::kInt32) : GetMemDesc(*bias))
+           : mkldnn::memory::desc{
+             {}, mkldnn::memory::data_type::data_undef, 
mkldnn::memory::format::any};
+  auto bias_md_ptr = bias ? &bias_md : nullptr;
+
   mkldnn::memory::dims strides(param.conv_param.kernel.ndim());
   mkldnn::memory::dims padding(param.conv_param.kernel.ndim());
   if (param.conv_param.kernel.ndim() == 1) {
@@ -77,55 +83,61 @@ mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
   mkldnn::primitive_attr attr;
   mkldnn::post_ops ops;
   if (param.mkldnn_param.with_relu) {
-    float scale = 1.0f;            // for fp32, scale is 1.
-    float alpha = 0.0f;            // negative slope for mkldnn_eltwise_relu.
-    float beta = 1.0f;             // ignored for mkldnn_eltwise_relu.
+    float scale = 1.0f;  // for fp32, scale is 1.
+    float alpha = 0.0f;  // negative slope for mkldnn_eltwise_relu.
+    float beta = 1.0f;   // ignored for mkldnn_eltwise_relu.
     ops.append_eltwise(scale, eltwise_relu, alpha, beta);
   }
   if (param.mkldnn_param.with_sum) {
     ops.append_sum(param.sum_scale);
   }
   if (param.mkldnn_param.with_postsum_relu) {
-    float scale = 1.0f;            // for fp32, scale is 1.
-    float alpha = 0.0f;            // negative slope for mkldnn_eltwise_relu.
-    float beta = 1.0f;             // ignored for mkldnn_eltwise_relu.
+    float scale = 1.0f;  // for fp32, scale is 1.
+    float alpha = 0.0f;  // negative slope for mkldnn_eltwise_relu.
+    float beta = 1.0f;   // ignored for mkldnn_eltwise_relu.
     ops.append_eltwise(scale, eltwise_relu, alpha, beta);
   }
   attr.set_post_ops(ops);
 
   if (param.mkldnn_param.quantized && param.requantize_scales.size()) {
-    int mask = param.mkldnn_param.weight_channelwise_scale ? 2 : 0;
+    int mask = (param.requantize_scales.size() > 1) ? 2 : 0;
     attr.set_output_scales(mask, param.requantize_scales);
     attr.set_int_output_round_mode(round_nearest);
   }
-
-  // MKL-DNN introduced padded formats since 0.15 which require more memory
-  // for computation compared with the actual tensor size. Currently, MKL-DNN
-  // operators are still reusing those memory from memory planning and the
-  // memory size may smaller than what MKL-DNN kernels require. So here we need
-  // select suboptimal kernel for computation according to tensor sizes.
-  if (param.conv_param.dilate.ndim() == 0 && bias == nullptr) {
-    mkldnn::convolution_forward::desc desc(prop, 
mkldnn::algorithm::convolution_direct,
-        data_md, weight_md, out_md, strides, padding, padding, 
mkldnn::padding_kind::zero);
-    auto conv_pd =  mkldnn::convolution_forward::primitive_desc(desc, attr, 
engine);
-    while (conv_pd.dst_primitive_desc().get_size() != GetArraySize(output) ||
-           conv_pd.src_primitive_desc().get_size() != GetArraySize(data) ||
-           conv_pd.weights_primitive_desc().get_size() != 
GetArraySize(weights)) {
-      CHECK(conv_pd.next_impl()) << "No implementation";
+  auto GetConvFwdPd = [&param, &data, &weights, &output,
+                       &attr](const mkldnn::convolution_forward::desc &desc) {
+    auto engine = CpuEngine::Get()->get_engine();
+    try {
+      auto conv_pd = mkldnn::convolution_forward::primitive_desc(desc, attr, 
engine);
+      while (conv_pd.dst_primitive_desc().get_size() != GetArraySize(output) ||
+             conv_pd.src_primitive_desc().get_size() != GetArraySize(data) ||
+             (!param.mkldnn_param.quantized &&
+              conv_pd.weights_primitive_desc().get_size() != 
GetArraySize(weights))) {
+        // next_impl() will visit desc and engine, please make sure they are 
still alive here.
+        CHECK(conv_pd.next_impl()) << "No convolution implementation for this 
request.";
+      }
+      return conv_pd;
+    } catch (mkldnn::error &e) {
+      if (e.status == mkldnn_unimplemented && param.mkldnn_param.quantized) {
+        LOG(ERROR) << "AVX512-BW support or Intel(R) MKL dependency is "
+                      "required for int8 convolution";
+      } else {
+        LOG(ERROR) << e.message;
+      }
+      throw;
     }
-    return conv_pd;
+  };
+
+  if (param.conv_param.dilate.ndim() == 0 && bias_md_ptr == nullptr) {
+    mkldnn::convolution_forward::desc desc(prop, 
mkldnn::algorithm::convolution_direct, data_md,
+                                           weight_md, out_md, strides, 
padding, padding,
+                                           mkldnn::padding_kind::zero);
+    return GetConvFwdPd(desc);
   } else if (param.conv_param.dilate.ndim() == 0) {
-    auto bias_md = GetMemDesc(*bias);
-    mkldnn::convolution_forward::desc desc(prop, 
mkldnn::algorithm::convolution_direct,
-        data_md, weight_md, bias_md, out_md, strides, padding, padding,
-        mkldnn::padding_kind::zero);
-    auto conv_pd =  mkldnn::convolution_forward::primitive_desc(desc, attr, 
engine);
-    while (conv_pd.dst_primitive_desc().get_size() != GetArraySize(output) ||
-           conv_pd.src_primitive_desc().get_size() != GetArraySize(data) ||
-           conv_pd.weights_primitive_desc().get_size() != 
GetArraySize(weights)) {
-      CHECK(conv_pd.next_impl()) << "No implementation";
-    }
-    return conv_pd;
+    mkldnn::convolution_forward::desc desc(prop, 
mkldnn::algorithm::convolution_direct, data_md,
+                                           weight_md, *bias_md_ptr, out_md, 
strides, padding,
+                                           padding, 
mkldnn::padding_kind::zero);
+    return GetConvFwdPd(desc);
   } else {
     mkldnn::memory::dims dilates(param.conv_param.kernel.ndim());
     if (param.conv_param.dilate.ndim() == 1) {
@@ -134,34 +146,19 @@ mkldnn::convolution_forward::primitive_desc 
GetConvFwdImpl(
       dilates[0] = param.conv_param.dilate[0] - 1;
       dilates[1] = param.conv_param.dilate[1] - 1;
     } else {
-      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
-                 << param.conv_param.dilate.ndim()
+      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << 
param.conv_param.dilate.ndim()
                  << ", supporting only 1 or 2.";
     }
-    if (bias == nullptr) {
-      mkldnn::convolution_forward::desc desc(prop, 
mkldnn::algorithm::convolution_direct,
-          data_md, weight_md, out_md, strides, dilates, padding, padding,
-          mkldnn::padding_kind::zero);
-      auto conv_pd =  mkldnn::convolution_forward::primitive_desc(desc, attr, 
engine);
-      while (conv_pd.dst_primitive_desc().get_size() != GetArraySize(output) ||
-             conv_pd.src_primitive_desc().get_size() != GetArraySize(data) ||
-             conv_pd.weights_primitive_desc().get_size() != 
GetArraySize(weights)) {
-        CHECK(conv_pd.next_impl()) << "No implementation";
-      }
-      return conv_pd;
-    } else {
-      auto bias_md = GetMemDesc(*bias);
-      mkldnn::convolution_forward::desc desc(prop, 
mkldnn::algorithm::convolution_direct,
-                                             data_md, weight_md, bias_md, 
out_md, strides,
-                                             dilates, padding, padding,
+    if (bias_md_ptr == nullptr) {
+      mkldnn::convolution_forward::desc desc(prop, 
mkldnn::algorithm::convolution_direct, data_md,
+                                             weight_md, out_md, strides, 
dilates, padding, padding,
                                              mkldnn::padding_kind::zero);
-      auto conv_pd =  mkldnn::convolution_forward::primitive_desc(desc, attr, 
engine);
-      while (conv_pd.dst_primitive_desc().get_size() != GetArraySize(output) ||
-             conv_pd.src_primitive_desc().get_size() != GetArraySize(data) ||
-             conv_pd.weights_primitive_desc().get_size() != 
GetArraySize(weights)) {
-        CHECK(conv_pd.next_impl()) << "No implementation";
-      }
-      return conv_pd;
+      return GetConvFwdPd(desc);
+    } else {
+      mkldnn::convolution_forward::desc desc(prop, 
mkldnn::algorithm::convolution_direct, data_md,
+                                             weight_md, *bias_md_ptr, out_md, 
strides, dilates,
+                                             padding, padding, 
mkldnn::padding_kind::zero);
+      return GetConvFwdPd(desc);
     }
   }
 }
@@ -328,48 +325,31 @@ static 
mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
   }
 }
 
-void MKLDNNConvForward::SetNewMem(const mkldnn::memory &data,
-                                  const mkldnn::memory &weight,
-                                  const mkldnn::memory *bias,
-                                  const mkldnn::memory &output) {
-  if (this->data_ == nullptr)
-    this->data_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-            fwd_pd.src_primitive_desc(), data.get_data_handle()));
-  else
-    this->data_->set_data_handle(data.get_data_handle());
-
-  if (this->weight_ == nullptr)
-    this->weight_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-            fwd_pd.weights_primitive_desc(), weight.get_data_handle()));
-  else
-    this->weight_->set_data_handle(weight.get_data_handle());
-
-  if (this->out_ == nullptr)
-    this->out_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-            fwd_pd.dst_primitive_desc(), output.get_data_handle()));
-  else
-    this->out_->set_data_handle(output.get_data_handle());
-
-  if (bias != nullptr) {
-    if (this->bias_ == nullptr)
-      this->bias_ = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-              fwd_pd.bias_primitive_desc(), bias->get_data_handle()));
-    else
-      this->bias_->set_data_handle(bias->get_data_handle());
-    if (this->fwd_ == nullptr)
-      this->fwd_ = std::shared_ptr<mkldnn::convolution_forward>(
-          new mkldnn::convolution_forward(fwd_pd, 
mkldnn::primitive::at(*this->data_),
-                                          
mkldnn::primitive::at(*this->weight_),
-                                          mkldnn::primitive::at(*this->bias_),
-                                          *this->out_));
-  } else if (this->fwd_ == nullptr) {
-    this->fwd_ = std::shared_ptr<mkldnn::convolution_forward>(
-        new mkldnn::convolution_forward(fwd_pd, 
mkldnn::primitive::at(*this->data_),
-                                        mkldnn::primitive::at(*this->weight_),
-                                        *this->out_));
+MKLDNNConvForward::MKLDNNConvForward(const MKLDNNConvFullParam &param, const 
bool is_train,
+                                     const NDArray &data, const NDArray 
&weights,
+                                     const NDArray *bias, const NDArray 
&output)
+    : fwd_pd(GetConvFwdImpl(param, is_train, data, weights, bias, output)) {
+  data_ = std::make_shared<mkldnn::memory>(fwd_pd.src_primitive_desc(), 
nullptr);
+  weight_ = std::make_shared<mkldnn::memory>(fwd_pd.weights_primitive_desc(), 
nullptr);
+  out_ = std::make_shared<mkldnn::memory>(fwd_pd.dst_primitive_desc(), 
nullptr);
+  if (bias) {
+    bias_ = std::make_shared<mkldnn::memory>(fwd_pd.bias_primitive_desc(), 
nullptr);
+    fwd_ = std::make_shared<mkldnn::convolution_forward>(fwd_pd, *this->data_, 
*this->weight_,
+                                                         *this->bias_, 
*this->out_);
+  } else {
+    fwd_ = std::make_shared<mkldnn::convolution_forward>(fwd_pd, *this->data_, 
*this->weight_,
+                                                         *this->out_);
   }
 }
 
+void MKLDNNConvForward::SetNewMem(const mkldnn::memory &data, const 
mkldnn::memory &weight,
+                                  const mkldnn::memory *bias, const 
mkldnn::memory &output) {
+  data_->set_data_handle(data.get_data_handle());
+  weight_->set_data_handle(weight.get_data_handle());
+  out_->set_data_handle(output.get_data_handle());
+  if (bias != nullptr) bias_->set_data_handle(bias->get_data_handle());
+}
+
 MKLDNNConvForward &GetConvFwd(const ConvolutionParam &param,
                               const bool is_train, const NDArray &data,
                               const NDArray &weights, const NDArray *bias,
diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h 
b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
index 89c3c19..b66adf7 100644
--- a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
+++ b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
@@ -74,6 +74,10 @@ static void MKLDNNDequantizeComputeKer(const 
std::vector<NDArray> &inputs,
     i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
   }
   mkldnn::memory::format i_fmt = 
static_cast<mkldnn::memory::format>(i_desc.data.format);
+  if (i_fmt == mkldnn::memory::format::nhwc) {
+    // For 4d tensor, nchw is the default format
+    i_fmt = mkldnn::memory::format::nchw;
+  }
   auto o_desc = mkldnn::memory::desc(i_dims,
                                     
(mkldnn::memory::data_type)data_type_enum<DstType>::type,
                                     i_fmt);
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h 
b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
new file mode 100644
index 0000000..e201d29
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_quantize_v2-inl.h
+ * \brief
+ */
+
+#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_V2_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_V2_INL_H_
+#if MXNET_USE_MKLDNN == 1
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "../../nn/mkldnn/mkldnn_base-inl.h"
+#include "../quantize_v2-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template <typename SrcType, typename DstType>
+static void MKLDNNQuantizeComputeKer(const std::vector<NDArray>& inputs,
+                                     const std::vector<NDArray>& outputs,
+                                     const QuantizeV2Param& param,
+                                     const std::vector<OpReqType>& req) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  using red::limits::MaxValue;
+  using red::limits::MinValue;
+  SrcType real_range = 0.f;
+  DstType quantized_range = 0;
+  NDArray in_buffer = inputs[0];
+  SrcType data_min = red::limits::MaxValue<SrcType>();
+  SrcType data_max = red::limits::MinValue<SrcType>();
+  if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
+    data_min = param.min_calib_range.value();
+    data_max = param.max_calib_range.value();
+  } else {
+    // no calib info
+    in_buffer = inputs[0].Reorder2Default();
+    auto in_ptr = in_buffer.data().dptr<SrcType>();
+    auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+    std::vector<SrcType> data_maxs(nthreads, data_max);
+    std::vector<SrcType> data_mins(nthreads, data_min);
+#pragma omp parallel for num_threads(nthreads)
+    for (index_t i = 0; i < static_cast<index_t>(in_buffer.shape().Size()); 
i++) {
+      int tid = omp_get_thread_num();
+      if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i];
+      if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i];
+    }
+    for (index_t i = 0; i < nthreads; i++) {
+      if (data_maxs[i] > data_max) data_max = data_maxs[i];
+      if (data_mins[i] < data_min) data_min = data_mins[i];
+    }
+  }
+
+  auto out_type = GetOutputType(param);
+  if (out_type == mshadow::kUint8) {
+    real_range = std::max<SrcType>(0.f, data_max);
+    quantized_range = MaxValue<DstType>();
+    *outputs[1].data().dptr<float>() = 0.f;
+    *outputs[2].data().dptr<float>() = real_range;
+  } else if (out_type == mshadow::kInt8) {
+    real_range = MaxAbs(data_min, data_max);
+    quantized_range = MinAbs(MaxValue<DstType>(), MinValue<DstType>());
+    *outputs[1].data().dptr<float>() = -real_range;
+    *outputs[2].data().dptr<float>() = real_range;
+  } else {
+    LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output 
type";
+  }
+  float scale = static_cast<float>(quantized_range) / real_range;
+
+  primitive_attr attr;
+  const int mask = 0;
+  std::vector<float> scales = {scale};
+  attr.set_output_scales(mask, scales);
+  attr.set_int_output_round_mode(round_nearest);
+  mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
+
+  if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) in_buffer = 
inputs[0].Reorder2Default();
+  auto i_mem = in_buffer.GetMKLDNNData();
+  auto i_mpd = i_mem->get_primitive_desc();
+  auto i_desc = i_mpd.desc();
+  mkldnn::memory::format i_fmt = 
static_cast<mkldnn::memory::format>(i_desc.data.format);
+  if (i_fmt == mkldnn::memory::format::nchw ||
+      i_fmt == mkldnn::memory::format::nChw8c ||
+      i_fmt == mkldnn_nChw16c) {
+    i_fmt = mkldnn::memory::format::nhwc;
+  }
+  size_t i_ndim = in_buffer.shape().ndim();
+  mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
+  for (size_t i = 0; i < i_ndim; i++) {
+    i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
+  }
+  auto o_desc =
+      mkldnn::memory::desc(i_dims, 
(mkldnn::memory::data_type)data_type_enum<DstType>::type, i_fmt);
+  auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
+  auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr);
+  auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, 
*o_mem.second));
+  CommitOutput(outputs[0], o_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+static void MKLDNNQuantizeV2Compute(const nnvm::NodeAttrs& attrs, const 
OpContext& ctx,
+                                    const std::vector<NDArray>& inputs,
+                                    const std::vector<OpReqType>& req,
+                                    const std::vector<NDArray>& outputs) {
+  const QuantizeV2Param& param = nnvm::get<QuantizeV2Param>(attrs.parsed);
+  auto out_type = GetOutputType(param);
+  if (out_type == mshadow::kUint8) {
+    MKLDNNQuantizeComputeKer<float, uint8_t>(inputs, outputs, param, req);
+  } else if (out_type == mshadow::kInt8) {
+    MKLDNNQuantizeComputeKer<float, int8_t>(inputs, outputs, param, req);
+  } else {
+    LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output 
type";
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_V2_INL_H_
diff --git a/src/operator/quantization/quantization_utils.h 
b/src/operator/quantization/quantization_utils.h
index ee71122..efc8410 100644
--- a/src/operator/quantization/quantization_utils.h
+++ b/src/operator/quantization/quantization_utils.h
@@ -27,6 +27,7 @@
 #include <mxnet/base.h>
 #include <algorithm>
 #include "../mxnet_op.h"
+#include "../tensor/broadcast_reduce_op.h"
 
 namespace mxnet {
 namespace op {
@@ -171,6 +172,20 @@ struct QuantizationRangeForMultiplicationStruct {
   }
 };
 
+template<typename xpu, typename DType>
+inline size_t ConfigReduce(mshadow::Stream<xpu>* s,
+                           const TShape& data_shape,
+                           const TShape& out_shape,
+                           TShape* src_shape,
+                           TShape* dst_shape) {
+  BroadcastReduceShapeCompact(data_shape, out_shape, src_shape, dst_shape);
+  constexpr int NDim = 2;
+  CHECK_EQ(src_shape->ndim(), NDim);
+  CHECK_EQ(dst_shape->ndim(), NDim);
+
+  return broadcast::ReduceWorkspaceSize<NDim, DType>(s, *dst_shape, kWriteTo, 
*src_shape);
+}
+
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_QUANTIZATION_QUANTIZATION_UTILS_H_
diff --git a/src/operator/quantization/quantize.cc 
b/src/operator/quantization/quantize.cc
index 5227751..e486f05 100644
--- a/src/operator/quantization/quantize.cc
+++ b/src/operator/quantization/quantize.cc
@@ -71,7 +71,7 @@ where
 `scale = quantized_range / MaxAbs(min_range, max_range).`
 
 .. Note::
-    This operator only supports forward propogation. DO NOT use it in 
training.)code" ADD_FILELINE)
+    This operator only supports forward propagation. DO NOT use it in 
training.)code" ADD_FILELINE)
 .set_attr_parser(ParamParser<QuantizeParam>)
 .set_num_inputs(3)
 .set_num_outputs(3)
diff --git a/src/operator/quantization/quantize_graph_pass.cc 
b/src/operator/quantization/quantize_graph_pass.cc
index fcd0fb4..af53397 100644
--- a/src/operator/quantization/quantize_graph_pass.cc
+++ b/src/operator/quantization/quantize_graph_pass.cc
@@ -26,6 +26,7 @@
 #include <nnvm/pass.h>
 #include <mxnet/op_attr_types.h>
 #include <unordered_set>
+#include "quantize_v2-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -63,12 +64,12 @@ NodePtr InsertNode(std::string op_name,
 }
 
 std::vector<NodeEntry> OfflineParams(std::vector<NodeEntry>&& outputs,
-                                     std::unordered_set<std::string>&& 
offline_params) {
+                                     const std::unordered_set<std::string>& 
offline_params) {
   std::string node_suffixs[3] = {"", "_min", "_max"};
   std::unordered_map<Node*, NodePtr> mirror_map;
   nnvm::NodeEntryMap<NodePtr> entry_var;
   auto need_offline = [&](NodePtr n) {
-    return (n->op() == Op::Get("_contrib_quantize")) &&
+    return (n->op() == Op::Get("_contrib_quantize_v2")) &&
            n->inputs[0].node->is_variable() &&
            offline_params.count(n->inputs[0].node->attrs.name);
   };
@@ -88,7 +89,8 @@ std::vector<NodeEntry> OfflineParams(std::vector<NodeEntry>&& 
outputs,
   return outputs;
 }
 
-inline bool NeedQuantize(NodePtr node, const std::unordered_set<std::string>& 
excluded_nodes) {
+inline bool NeedQuantize(const NodePtr node,
+                         const std::unordered_set<std::string>& 
excluded_nodes) {
   static auto& quantized_op_map = 
Op::GetAttr<mxnet::FQuantizedOp>("FQuantizedOp");
   static auto& fexec_type = nnvm::Op::GetAttr<FExecType>("FExecType");
   const auto& op = node->op();
@@ -121,10 +123,9 @@ Graph QuantizeGraph(Graph &&src) {
   static const auto& need_requantize_map = 
Op::GetAttr<mxnet::FNeedRequantize>("FNeedRequantize");
   static const auto& avoid_quantize_input_map =
       Op::GetAttr<mxnet::FAvoidQuantizeInput>("FAvoidQuantizeInput");
-  auto offline_params = 
src.GetAttr<std::unordered_set<std::string>>("offline_params");
-  auto excluded_nodes = 
src.GetAttr<std::unordered_set<std::string>>("excluded_nodes");
-  auto quantized_dtype = src.GetAttr<std::string>("quantized_dtype");
-  auto calib_quantize = src.GetAttr<bool>("calib_quantize");
+  const auto offline_params = 
src.GetAttr<std::unordered_set<std::string>>("offline_params");
+  const auto excluded_nodes = 
src.GetAttr<std::unordered_set<std::string>>("excluded_nodes");
+  const auto quantized_dtype = src.GetAttr<std::string>("quantized_dtype");
 
   // mirror_map stores the mapping from the currently visited graph to the 
newly created quantized
   // graph. Key is the currently visited graph's node pointer, and value is a 
copied node of the key
@@ -174,24 +175,10 @@ Graph QuantizeGraph(Graph &&src) {
               }
             }
 
-            NodePtr quantize_node = InsertNode("_contrib_quantize",
+            NodePtr quantize_node = InsertNode("_contrib_quantize_v2",
               e.node->attrs.name + suffix + "_quantize", new_node, 
mirror_entry);
             quantize_node->attrs.dict["out_type"] = quantized_dtype;
             quantize_node->op()->attr_parser(&(quantize_node->attrs));
-            if (calib_quantize) {
-              NodePtr min_var = CreateNode("nullptr", e.node->attrs.name + 
suffix + "_min");
-              quantize_node->inputs.emplace_back(NodeEntry{min_var, 0, 0});
-              NodePtr max_var = CreateNode("nullptr", e.node->attrs.name + 
suffix + "_max");
-              quantize_node->inputs.emplace_back(NodeEntry{max_var, 0, 0});
-            } else {
-              NodePtr min_node = InsertNode("min",
-                  e.node->attrs.name + suffix + "_min", quantize_node, 
mirror_entry);
-              min_node->op()->attr_parser(&(min_node->attrs));
-
-              NodePtr max_node = InsertNode("max",
-                  e.node->attrs.name + suffix + "_max", quantize_node, 
mirror_entry);
-              max_node->op()->attr_parser(&(max_node->attrs));
-            }
             mirror_entry_map[e] = NodeEntry{quantize_node, 0, e.version};
           }
         } else if (mirror_node->op() == Op::Get("_contrib_dequantize")) {
@@ -269,43 +256,35 @@ Graph QuantizeGraph(Graph &&src) {
       // the new_node.
       *new_node = *node;
       new_node->inputs.clear();
-      if (node->is_variable() && node->attrs.name == "data") {
-        // Insert identity for data to collect calib for it.
-        NodePtr identity_node =
-            CreateNode("identity", new_node->attrs.name + "_id");
-        identity_node->inputs.emplace_back(NodeEntry{new_node, 0, 0});
-        new_node = identity_node;
-      } else {
-        for (const auto& e : node->inputs) {
-          NodePtr mirror_node = mirror_map.at(e.node.get());
-          NodeEntry mirror_entry = NodeEntry{
-            mirror_node, e.index, e.version};
-          // if input node is quantized operator, add dequantize node
-          if (NeedQuantize(e.node, excluded_nodes) &&
-              (mirror_node->op() != Op::Get("_contrib_dequantize"))) {
-            // here we calculate the output number (exclude min/max, in order 
to
-            // calculate min/max index from mirror node) based on assumption 
that
-            // there is only 1min and 1max output from mirror node (which is
-            // currently true)
-            size_t num_outputs = mirror_node->num_outputs() - 2;
-            uint32_t min_index = num_outputs + 2 * e.index;
-            uint32_t max_index = num_outputs + 2 * e.index + 1;
-            NodePtr dequantize_node = CreateNode("_contrib_dequantize",
-              e.node->attrs.name + "_dequantize");
-            dequantize_node->inputs.emplace_back(mirror_entry);
-            dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, 
min_index, 0});
-            dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, 
max_index, 0});
-            dequantize_node->op()->attr_parser(&(dequantize_node->attrs));
+      for (const auto& e : node->inputs) {
+        NodePtr mirror_node = mirror_map.at(e.node.get());
+        NodeEntry mirror_entry = NodeEntry{
+          mirror_node, e.index, e.version};
+        // if input node is quantized operator, add dequantize node
+        if (NeedQuantize(e.node, excluded_nodes) &&
+            (mirror_node->op() != Op::Get("_contrib_dequantize"))) {
+          // here we calculate the output number (exclude min/max, in order to
+          // calculate min/max index from mirror node) based on assumption that
+          // there is only 1 min and 1 max output from mirror node (which is
+          // currently true)
+          size_t num_outputs = mirror_node->num_outputs() - 2;
+          uint32_t min_index = num_outputs + 2 * e.index;
+          uint32_t max_index = num_outputs + 2 * e.index + 1;
+          NodePtr dequantize_node = CreateNode("_contrib_dequantize",
+            e.node->attrs.name + "_dequantize");
+          dequantize_node->inputs.emplace_back(mirror_entry);
+          dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, 
min_index, 0});
+          dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, 
max_index, 0});
+          dequantize_node->op()->attr_parser(&(dequantize_node->attrs));
 
-            new_node->inputs.emplace_back(NodeEntry{dequantize_node, 0, 0});
-            mirror_map[e.node.get()] = std::move(dequantize_node);
-          } else if (mirror_entry_map.count(e)) {
-            new_node->inputs.emplace_back(
-                NodeEntry{mirror_entry_map[e].node->inputs[0].node, e.index, 
e.version});
-          } else {
-            new_node->inputs.emplace_back(
-                NodeEntry{mirror_node, e.index, e.version});
-          }
+          new_node->inputs.emplace_back(NodeEntry{dequantize_node, 0, 0});
+          mirror_map[e.node.get()] = std::move(dequantize_node);
+        } else if (mirror_entry_map.count(e)) {
+          new_node->inputs.emplace_back(
+              NodeEntry{mirror_entry_map[e].node->inputs[0].node, e.index, 
e.version});
+        } else {
+          new_node->inputs.emplace_back(
+              NodeEntry{mirror_node, e.index, e.version});
         }
       }
     }
@@ -334,8 +313,7 @@ Graph QuantizeGraph(Graph &&src) {
     }
   }
 
-  if (!offline_params.empty()) outputs =
-    OfflineParams(std::move(outputs), std::move(offline_params));
+  if (!offline_params.empty()) outputs = OfflineParams(std::move(outputs), 
offline_params);
 
   Graph ret;
   ret.outputs = std::move(outputs);
@@ -361,7 +339,11 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) {
           && 
need_requantize_map[quantized_op_node->op()](quantized_op_node->attrs))
           << quantized_op_node->attrs.name << " op must register 
FNeedRequantize attr"
                                               " and the attr func should 
return true";
-      std::string out_data_name = quantized_op_node->attrs.name + "_";
+      const std::string prefix = "quantized_";
+      CHECK(std::equal(prefix.begin(), prefix.end(), 
quantized_op_node->attrs.name.begin()))
+          << "a quantized op should start with `quantized_`";
+
+      std::string out_data_name = 
quantized_op_node->attrs.name.substr(prefix.size()) + "_";
       auto list_output_names_func = flist_outputs.get(quantized_op_node->op(), 
nullptr);
       // Here it's assumed that the quantized_op node only produces three 
outputs:
       // out_data, min_range, and max_range. So we want to get the 
pre-calculated min_calib_range
@@ -381,6 +363,34 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) {
         node->attrs.dict["max_calib_range"] = 
std::to_string(calib_table_iter->second.second);
         node->op()->attr_parser(&(node->attrs));
       }
+    } else if (node->op() == Op::Get("_contrib_quantize_v2")) {
+      NodePtr float_op_node = node->inputs[0].node;
+      auto float_op_idx = node->inputs[0].index;
+      std::string out_data_name = float_op_node->attrs.name;
+      if (float_op_node->op()) {
+        auto list_output_names_func = flist_outputs.get(float_op_node->op(), 
nullptr);
+        // We want to get the pre-calculated min_range and max_range from the 
calibration table for
+        // out_data. Here we create the output data name same as its 
constructed in
+        // GraphExecutor::ExecuteMonCallback.
+        if (list_output_names_func != nullptr) {
+          std::vector<std::string> names = 
list_output_names_func(float_op_node->attrs);
+          out_data_name += "_" + names[float_op_idx];
+        } else {
+          out_data_name += "_" + std::to_string(float_op_idx);
+        }
+      }
+      const auto calib_table_iter = calib_table.find(out_data_name);
+      if (calib_table_iter != calib_table.end()) {
+        node->attrs.dict["min_calib_range"] = 
std::to_string(calib_table_iter->second.first);
+        node->attrs.dict["max_calib_range"] = 
std::to_string(calib_table_iter->second.second);
+        node->op()->attr_parser(&(node->attrs));
+        const QuantizeV2Param& param = 
nnvm::get<QuantizeV2Param>(node->attrs.parsed);
+        if (param.out_type == QuantizeV2Param::OutType::kUint8 &&
+            param.min_calib_range.value() < 0.0f) {
+          LOG(WARNING) << "Calibration statistics indicates that node `" << 
node->attrs.name
+                       << "` has negative input, consider use `auto` or `int8` 
as out_type";
+        }
+      }
     }
   });
   return g;
diff --git a/src/operator/quantization/quantize_v2-inl.h 
b/src/operator/quantization/quantize_v2-inl.h
new file mode 100644
index 0000000..5ae10a7
--- /dev/null
+++ b/src/operator/quantization/quantize_v2-inl.h
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file quantize_v2-inl.h
+ * \brief implementation of quantize operation
+ */
+#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZE_V2_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_QUANTIZE_V2_INL_H_
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include <limits>
+#include "../elemwise_op_common.h"
+#include "../mshadow_op.h"
+#include "../mxnet_op.h"
+#include "./quantization_utils.h"
+#include "../tensor/broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+struct QuantizeV2Param : public dmlc::Parameter<QuantizeV2Param> {
+  enum OutType { kAuto = 0, kInt8, kUint8 };
+  int out_type;
+  dmlc::optional<float> min_calib_range;
+  dmlc::optional<float> max_calib_range;
+  DMLC_DECLARE_PARAMETER(QuantizeV2Param) {
+    DMLC_DECLARE_FIELD(out_type)
+      .add_enum("auto", kAuto)
+      .add_enum("int8", kInt8)
+      .add_enum("uint8", kUint8)
+      .set_default(kInt8)
+      .describe("Output data type. `auto` can be specified to automatically 
determine output type "
+                "according to min_calib_range.");
+    DMLC_DECLARE_FIELD(min_calib_range)
+      .set_default(dmlc::optional<float>())
+      .describe("The minimum scalar value in the form of float32. If present, 
it will be used to "
+                "quantize the fp32 data into int8 or uint8.");
+    DMLC_DECLARE_FIELD(max_calib_range)
+      .set_default(dmlc::optional<float>())
+      .describe("The maximum scalar value in the form of float32. If present, 
it will be used to "
+                "quantize the fp32 data into int8 or uint8.");
+  }
+};
+
+static mshadow::TypeFlag GetOutputType(const QuantizeV2Param &param) {
+  auto out_type = mshadow::kInt8;
+  if (param.out_type == QuantizeV2Param::OutType::kAuto) {
+    if (param.min_calib_range.has_value() && 
param.max_calib_range.has_value()) {
+      if (param.min_calib_range.value() >= 0.0) {
+        out_type = mshadow::kUint8;
+      } else {
+        out_type = mshadow::kInt8;
+      }
+    }
+  } else if (param.out_type == QuantizeV2Param::OutType::kInt8) {
+    out_type = mshadow::kInt8;
+  } else if (param.out_type == QuantizeV2Param::OutType::kUint8) {
+    out_type = mshadow::kUint8;
+  } else {
+    LOG(FATAL) << "Unsupported out_type in params: " << param.out_type;
+  }
+  return out_type;
+}
+
+// quantize float to uint8_t
+struct quantize_v2_unsigned {
+  template <typename DstDType, typename SrcDType>
+  MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, 
float *omax_range,
+                                  const SrcDType *in, const float imin_range,
+                                  const float imax_range, const double 
min_limit,
+                                  const double max_limit) {
+    const float scale = (max_limit - min_limit) / (imax_range - imin_range);
+    out[i] = static_cast<DstDType>((in[i] - imin_range) * scale + 0.5);
+    *omin_range = imin_range;
+    *omax_range = imax_range;
+  }
+
+  template <typename DstDType, typename SrcDType>
+  MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, 
float *omax_range,
+                                  const SrcDType *in, const float *imin_range,
+                                  const float *imax_range, const double 
min_limit,
+                                  const double max_limit) {
+    Map(i, out, omin_range, omax_range, in, *imin_range, *imax_range, 
min_limit, max_limit);
+  }
+};
+
+// keep zero-center
+struct quantize_v2_zero_centered {
+  template <typename DstDType, typename SrcDType>
+  MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, 
float *omax_range,
+                                  const SrcDType *in, const float imin_range,
+                                  const float imax_range, const float 
quantized_range) {
+    float real_range = MaxAbs(imin_range, imax_range);
+    float scale = quantized_range / real_range;
+    SrcDType x = in[i];
+    out[i] = static_cast<DstDType>(Sign(x) * Min(Abs(x) * scale + 0.5f, 
quantized_range));
+    *omin_range = -real_range;
+    *omax_range = real_range;
+  }
+
+  template <typename DstDType, typename SrcDType>
+  MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, 
float *omax_range,
+                                  const SrcDType *in, const float *imin_range,
+                                  const float *imax_range, const float 
quantized_range) {
+    Map(i, out, omin_range, omax_range, in, *imin_range, *imax_range, 
quantized_range);
+  }
+};
+
+template <typename xpu>
+void QuantizeV2Compute(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
+                       const std::vector<TBlob> &inputs, const 
std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &outputs) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  typedef float SrcDType;
+  using mshadow::red::limits::MaxValue;
+  using mshadow::red::limits::MinValue;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const QuantizeV2Param &param = nnvm::get<QuantizeV2Param>(attrs.parsed);
+  auto out_type = GetOutputType(param);
+  if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
+    if (out_type == mshadow::kUint8) {
+      Kernel<quantize_v2_unsigned, xpu>::Launch(
+          s, outputs[0].Size(), outputs[0].dptr<uint8_t>(), 
outputs[1].dptr<float>(),
+          outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), 
param.min_calib_range.value(),
+          param.max_calib_range.value(), MinValue<uint8_t>(), 
MaxValue<uint8_t>());
+    } else if (out_type == mshadow::kInt8) {  // zero-centered quantization
+      Kernel<quantize_v2_zero_centered, xpu>::Launch(
+          s, outputs[0].Size(), outputs[0].dptr<int8_t>(), 
outputs[1].dptr<float>(),
+          outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), 
param.min_calib_range.value(),
+          param.max_calib_range.value(), MinAbs(MaxValue<int8_t>(), 
MinValue<int8_t>()));
+    } else {
+      LOG(FATAL) << "quantize op only supports int8 and uint8 as output type";
+    }
+  } else {  // model is not calibrated
+    TShape src_shape, dst_shape;
+    const size_t actual_float_size = sizeof(float);
+    const size_t temp_reduce_size =
+        ConfigReduce<xpu, SrcDType>(s, inputs[0].shape_, TShape({1}), 
&src_shape, &dst_shape);
+    Tensor<xpu, 1, char> temp_space = ctx.requested[0].get_space_typed<xpu, 1, 
char>(
+        Shape1(2 * actual_float_size + temp_reduce_size), s);
+    const int dev_id = ctx.run_ctx.ctx.dev_id;
+    TBlob in_min_t(reinterpret_cast<SrcDType *>(temp_space.dptr_), Shape1(1), 
xpu::kDevMask,
+                   dev_id);
+    TBlob in_max_t(reinterpret_cast<SrcDType *>(temp_space.dptr_) + 1, 
Shape1(1), xpu::kDevMask,
+                   dev_id);
+    Tensor<xpu, 1, char> workspace(temp_space.dptr_ + 2 * actual_float_size,
+                                   Shape1(temp_reduce_size), s);
+    broadcast::Reduce<red::minimum, 2, SrcDType, mshadow::op::identity>(
+        s, in_min_t.reshape(dst_shape), kWriteTo, workspace, 
inputs[0].reshape(src_shape));
+    broadcast::Reduce<red::maximum, 2, SrcDType, mshadow::op::identity>(
+        s, in_max_t.reshape(dst_shape), kWriteTo, workspace, 
inputs[0].reshape(src_shape));
+    if (out_type == mshadow::kUint8) {
+      Kernel<quantize_v2_unsigned, xpu>::Launch(
+          s, outputs[0].Size(), outputs[0].dptr<uint8_t>(), 
outputs[1].dptr<float>(),
+          outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), 
in_min_t.dptr<float>(),
+          in_max_t.dptr<float>(), MinValue<uint8_t>(), MaxValue<uint8_t>());
+    } else if (out_type == mshadow::kInt8) {  // zero-centered quantization
+      Kernel<quantize_v2_zero_centered, xpu>::Launch(
+          s, outputs[0].Size(), outputs[0].dptr<int8_t>(), 
outputs[1].dptr<float>(),
+          outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), 
in_min_t.dptr<float>(),
+          in_max_t.dptr<float>(), MinAbs(MaxValue<int8_t>(), 
MinValue<int8_t>()));
+    } else {
+      LOG(FATAL) << "quantize op only supports int8 and uint8 as output type";
+    }
+  }
+}
+
+static inline bool QuantizeV2Shape(const nnvm::NodeAttrs &attrs, 
std::vector<TShape> *in_attrs,
+                                   std::vector<TShape> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 3U);
+
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape{1});
+  SHAPE_ASSIGN_CHECK(*out_attrs, 2, TShape{1});
+  return !shape_is_none(out_attrs->at(0));
+}
+
+static inline bool QuantizeV2Type(const nnvm::NodeAttrs &attrs, 
std::vector<int> *in_attrs,
+                                  std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 3U);
+  const QuantizeV2Param &param = nnvm::get<QuantizeV2Param>(attrs.parsed);
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+  auto out_type = GetOutputType(param);
+  if (out_type == mshadow::kUint8) {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8);
+  } else if (out_type == mshadow::kInt8) {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  } else {
+    LOG(FATAL) << "quantize op only supports int8 and uint8 as output type";
+  }
+  TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32);
+  TYPE_ASSIGN_CHECK(*out_attrs, 2, mshadow::kFloat32);
+  return (*in_attrs)[0] != -1;
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_QUANTIZATION_QUANTIZE_V2_INL_H_
diff --git a/src/operator/quantization/quantize.cc 
b/src/operator/quantization/quantize_v2.cc
similarity index 53%
copy from src/operator/quantization/quantize.cc
copy to src/operator/quantization/quantize_v2.cc
index 5227751..2141093 100644
--- a/src/operator/quantization/quantize.cc
+++ b/src/operator/quantization/quantize_v2.cc
@@ -22,20 +22,19 @@
  * \file quantize.cc
  * \brief
  */
-#include "./quantize-inl.h"
+
+#include "./quantize_v2-inl.h"
 #if MXNET_USE_MKLDNN == 1
-#include "./mkldnn/mkldnn_quantize-inl.h"
+#include "./mkldnn/mkldnn_quantize_v2-inl.h"
 #endif
 
 namespace mxnet {
 namespace op {
-DMLC_REGISTER_PARAMETER(QuantizeParam);
+DMLC_REGISTER_PARAMETER(QuantizeV2Param);
 
-bool QuantizeStorageType(const nnvm::NodeAttrs& attrs,
-                         const int dev_mask,
-                         DispatchMode* dispatch_mode,
-                         std::vector<int> *in_attrs,
-                         std::vector<int> *out_attrs) {
+static bool QuantizeV2StorageType(const nnvm::NodeAttrs& attrs, const int 
dev_mask,
+                                  DispatchMode* dispatch_mode, 
std::vector<int>* in_attrs,
+                                  std::vector<int>* out_attrs) {
   *dispatch_mode = DispatchMode::kFCompute;
 #if MXNET_USE_MKLDNN == 1
   if (dev_mask == mshadow::cpu::kDevMask) {
@@ -48,12 +47,11 @@ bool QuantizeStorageType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
-NNVM_REGISTER_OP(_contrib_quantize)
+NNVM_REGISTER_OP(_contrib_quantize_v2)
 .describe(R"code(Quantize a input tensor from float to `out_type`,
-with user-specified `min_range` and `max_range`.
+with user-specified `min_calib_range` and `max_calib_range` or the input range 
collected at runtime.
 
-min_range and max_range are scalar floats that specify the range for
-the input data.
+Output `min_range` and `max_range` are scalar floats that specify the range 
for the input data.
 
 When out_type is `uint8`, the output is calculated using the following 
equation:
 
@@ -70,29 +68,36 @@ where
 `quantized_range = MinAbs(max(int8), min(int8))` and
 `scale = quantized_range / MaxAbs(min_range, max_range).`
 
+When out_type is `auto`, the output type is automatically determined by 
min_calib_range if present.
+If min_calib_range < 0.0f, the output type will be int8, otherwise will be 
uint8.
+If min_calib_range isn't present, the output type will be int8.
+
 .. Note::
-    This operator only supports forward propogation. DO NOT use it in 
training.)code" ADD_FILELINE)
-.set_attr_parser(ParamParser<QuantizeParam>)
-.set_num_inputs(3)
+    This operator only supports forward propagation. DO NOT use it in 
training.)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<QuantizeV2Param>)
+.set_num_inputs(1)
 .set_num_outputs(3)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
-  [](const NodeAttrs& attrs) {
-    return std::vector<std::string>{"data", "min_range", "max_range"};
-  })
-.set_attr<nnvm::FInferShape>("FInferShape", QuantizeShape)
-.set_attr<nnvm::FInferType>("FInferType", QuantizeType)
-.set_attr<FInferStorageType>("FInferStorageType", QuantizeStorageType)
+.set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) 
{
+  return std::vector<std::string>{"data"};
+})
+.set_attr<nnvm::FInferShape>("FInferShape", QuantizeV2Shape)
+.set_attr<nnvm::FInferType>("FInferType", QuantizeV2Type)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizeV2StorageType)
 #if MXNET_USE_MKLDNN == 1
 .set_attr<bool>("TIsMKLDNN", true)
-.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNQuantizeCompute)
+.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNQuantizeV2Compute)
 #endif
-.set_attr<FCompute>("FCompute<cpu>", QuantizeCompute<cpu>)
+.set_attr<FCompute>("FCompute<cpu>", QuantizeV2Compute<cpu>)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) {
+  const QuantizeV2Param &param = nnvm::get<QuantizeV2Param>(attrs.parsed);
+  if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
+    return std::vector<ResourceRequest>();
+  } else {
+    return std::vector<ResourceRequest>(1, ResourceRequest::kTempSpace);
+  }
+})
 .add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type 
`float32`")
-.add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value "
-  "possibly produced for the input")
-.add_argument("max_range", "NDArray-or-Symbol", "The maximum scalar value "
-  "possibly produced for the input")
-.add_arguments(QuantizeParam::__FIELDS__());
+.add_arguments(QuantizeV2Param::__FIELDS__());
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/quantization/quantize_v2.cu 
b/src/operator/quantization/quantize_v2.cu
new file mode 100644
index 0000000..ab0cf9c
--- /dev/null
+++ b/src/operator/quantization/quantize_v2.cu
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file quantize_v2.cu
+ * \brief
+ */
+#include "./quantize_v2-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_contrib_quantize_v2)
+.set_attr<FCompute>("FCompute<gpu>", QuantizeV2Compute<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/quantization/requantize-inl.h 
b/src/operator/quantization/requantize-inl.h
index e07a149..148453e 100644
--- a/src/operator/quantization/requantize-inl.h
+++ b/src/operator/quantization/requantize-inl.h
@@ -87,20 +87,6 @@ struct RequantizeKernel {
   }
 };
 
-template<typename xpu, typename DType>
-inline size_t ConfigReduce(mshadow::Stream<xpu>* s,
-                           const TShape& data_shape,
-                           const TShape& out_shape,
-                           TShape* src_shape,
-                           TShape* dst_shape) {
-  BroadcastReduceShapeCompact(data_shape, out_shape, src_shape, dst_shape);
-  constexpr int NDim = 2;
-  CHECK_EQ(src_shape->ndim(), NDim);
-  CHECK_EQ(dst_shape->ndim(), NDim);
-
-  return broadcast::ReduceWorkspaceSize<NDim, DType>(s, *dst_shape, kWriteTo, 
*src_shape);
-}
-
 template<typename xpu>
 void RequantizeForward(const nnvm::NodeAttrs& attrs,
                        const OpContext& ctx,
diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc 
b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
index 6a8feae..499d739 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
@@ -43,10 +43,10 @@ static void UpdateConvWeightBias(NDArray *weight, NDArray 
*bias, bool no_bias,
                                 true, beta.dtype());
   const DType *weight_ptr = weight->data().dptr<DType>();
   const DType *bias_ptr = no_bias ? nullptr : bias->data().dptr<DType>();
-  const DType *gamma_ptr = gamma.Reorder2Default().data().dptr<DType>();
-  const DType *beta_ptr = beta.Reorder2Default().data().dptr<DType>();
-  const DType *mean_ptr = mean.Reorder2Default().data().dptr<DType>();
-  const DType *var_ptr = variance.Reorder2Default().data().dptr<DType>();
+  const DType *gamma_ptr = gamma.data().dptr<DType>();
+  const DType *beta_ptr = beta.data().dptr<DType>();
+  const DType *mean_ptr = mean.data().dptr<DType>();
+  const DType *var_ptr = variance.data().dptr<DType>();
   DType *update_weight_ptr = update_weight.data().dptr<DType>();
   DType *update_bias_ptr = update_bias.data().dptr<DType>();
   size_t channel = gamma.shape()[0];
@@ -77,23 +77,17 @@ static inline size_t GetInSumIndex(const 
MKLDNNConvFusionParam &param) {
 }
 
 template <typename DType>
-static void QuantizeConvWeightBias(NDArray *weight, NDArray *bias,
-                                   bool has_bias, float data_scale,
-                                   bool weight_channelwise_scale,
-                                   std::vector<float> *weight_scales) {
+static std::vector<float> GetWeightScales(const NDArray &weight, bool 
weight_channelwise_scale) {
   using red::limits::MaxValue;
   using red::limits::MinValue;
-  const DType *weight_ptr = weight->data().dptr<DType>();
-  NDArray quantized_weight = NDArray(weight->storage_type(), weight->shape(),
-                                     weight->ctx(), true, mshadow::kInt8);
-  int8_t *quan_weight_ptr = quantized_weight.data().dptr<int8_t>();
-  size_t channel = weight->shape()[0];
+  std::vector<float> weight_scales;
+  const DType *weight_ptr = weight.data().dptr<DType>();
+  size_t channel = weight.shape()[0];
 
   // TODO(Zhennan): Handle the case weight is not in dims 4.
-  size_t offset = weight->shape()[1] * weight->shape()[2] * weight->shape()[3];
+  size_t offset = weight.shape()[1] * weight.shape()[2] * weight.shape()[3];
   std::vector<DType> weight_c_min(channel, MaxValue<DType>());
   std::vector<DType> weight_c_max(channel, MinValue<DType>());
-#pragma omp parallel for 
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
   for (int c = 0; c < static_cast<int>(channel); ++c) {
     const DType *p1 = weight_ptr + c * offset;
     for (size_t k = 0; k < offset; ++k) {
@@ -105,16 +99,10 @@ static void QuantizeConvWeightBias(NDArray *weight, 
NDArray *bias,
   }
 
   if (weight_channelwise_scale) {
-    weight_scales->resize(channel);
-#pragma omp parallel for 
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+    weight_scales.resize(channel);
     for (int c = 0; c < static_cast<int>(channel); ++c) {
       DType weight_range = MaxAbs(weight_c_min[c], weight_c_max[c]);
-      weight_scales->at(c) = kInt8Range / weight_range;
-      const DType *fp_ptr = weight_ptr + c * offset;
-      int8_t *quan_ptr = quan_weight_ptr + c * offset;
-      for (size_t k = 0; k < offset; ++k) {
-        quan_ptr[k] = std::round(weight_scales->at(c) * fp_ptr[k]);
-      }
+      weight_scales[c] = kInt8Range / weight_range;
     }
   } else {
     DType total_min = weight_c_min[0];
@@ -123,74 +111,73 @@ static void QuantizeConvWeightBias(NDArray *weight, 
NDArray *bias,
       if (total_min > weight_c_min[c]) total_min = weight_c_min[c];
       if (total_max < weight_c_max[c]) total_max = weight_c_max[c];
     }
-    weight_scales->resize(1);
+    weight_scales.resize(1);
     DType weight_range = MaxAbs(total_min, total_max);
-    weight_scales->at(0) = kInt8Range / weight_range;
-#pragma omp parallel for 
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-    for (int c = 0; c < static_cast<int>(channel); ++c) {
-      const DType *fp_ptr = weight_ptr + c * offset;
-      int8_t *quan_ptr = quan_weight_ptr + c * offset;
-      for (size_t k = 0; k < offset; ++k) {
-        quan_ptr[k] = std::round(weight_scales->at(0) * fp_ptr[k]);
-      }
-    }
-  }
-
-  *weight = quantized_weight;
-  if (has_bias) {
-    const DType *bias_ptr = bias->data().dptr<DType>();
-    NDArray quantized_bias = NDArray(bias->storage_type(), bias->shape(),
-                                     bias->ctx(), true, mshadow::kInt32);
-    int32_t *quan_bias_ptr = quantized_bias.data().dptr<int32_t>();
-    for (size_t c = 0; c < channel; ++c) {
-      auto weight_scale =
-          weight_channelwise_scale ? weight_scales->at(c) : 
weight_scales->at(0);
-      float bias_scale = weight_scale * data_scale;
-      quan_bias_ptr[c] = std::round(bias_scale * bias_ptr[c]);
-    }
-    *bias = quantized_bias;
+    weight_scales[0] = kInt8Range / weight_range;
   }
+  return weight_scales;
 }
 
-static void ConvFusionFallBackCompute() {
-  LOG(FATAL) << "Don't know how to do ConvFusionFallBackCompute!";
-}
-
-static void ConvolutionFusionComputeExCPU(const MKLDNNConvFullParam 
&full_param,
-                                          const OpContext &ctx,
-                                          MKLDNNConvForward *fwd,
-                                          const std::vector<NDArray> &inputs,
-                                          const std::vector<OpReqType> &req,
-                                          const std::vector<NDArray> &outputs) 
{
-  if (SupportMKLDNNConv(full_param.conv_param, inputs[0])) {
-    MKLDNNConvolutionForwardFullFeature(full_param, ctx, fwd, inputs, req, 
outputs);
-    return;
+static void ConvertWeightBias2MKLDNN(const MKLDNNConvFullParam &param,
+                                     
mkldnn::convolution_forward::primitive_desc fwd_pd,
+                                     NDArray *weight, NDArray *bias, bool 
has_bias,
+                                     float data_scale, const 
std::vector<float> &weight_scales) {
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  const auto new_weight = NDArray(fwd_pd.weights_primitive_desc());
+  const auto conv_weights_memory = new_weight.GetMKLDNNData();
+  primitive_attr weight_attr;
+  if (weight_scales.size()) {
+    const int weight_mask = (weight_scales.size()) == 1 ? 0 : 1;
+    weight_attr.set_int_output_round_mode(round_mode::round_nearest);
+    weight_attr.set_output_scales(weight_mask, weight_scales);
+  }
+  auto default_weights_memory = GetWeights(*weight, 
param.conv_param.num_group);
+  if (default_weights_memory == nullptr) default_weights_memory = 
weight->GetMKLDNNData();
+  const auto weight_reorder_pd =
+      
mkldnn::reorder::primitive_desc(default_weights_memory->get_primitive_desc(),
+                                      
conv_weights_memory->get_primitive_desc(), weight_attr);
+  stream->RegisterPrim(
+      mkldnn::reorder(weight_reorder_pd, *default_weights_memory, 
*conv_weights_memory));
+
+  NDArray new_bias;
+  if (has_bias && data_scale) {
+    std::vector<float> bias_scales(weight_scales.size());
+    for (size_t c = 0; c < weight_scales.size(); ++c) {
+      bias_scales[c] = weight_scales[c] * data_scale;
+    }
+    new_bias = NDArray(fwd_pd.bias_primitive_desc());
+    const auto conv_bias_memory = new_bias.GetMKLDNNData();
+    const int bias_mask = (bias_scales.size()) == 1 ? 0 : 1;
+    primitive_attr bias_attr;
+    bias_attr.set_int_output_round_mode(round_mode::round_nearest);
+    bias_attr.set_output_scales(bias_mask, bias_scales);
+    auto bias_weights_memory = bias->GetMKLDNNData();
+    auto bias_reorder_pd =
+        
mkldnn::reorder::primitive_desc(bias_weights_memory->get_primitive_desc(),
+                                        
conv_bias_memory->get_primitive_desc(), bias_attr);
+    stream->RegisterPrim(
+        mkldnn::reorder(bias_reorder_pd, *bias_weights_memory, 
*conv_bias_memory));
   }
-  ConvFusionFallBackCompute();
+  stream->Submit();
+  *weight = new_weight;
+  if (has_bias && data_scale) *bias = new_bias;
 }
 
 class SgMKLDNNConvOperator {
  public:
   explicit SgMKLDNNConvOperator(const nnvm::NodeAttrs &attrs)
-      : initalized_(false),
-        subgraph_sym_(*attrs.subgraphs[0]),
-        param_(nnvm::get<MKLDNNConvFusionParam>(attrs.parsed)),
-        inplace_(false) {}
+      : subgraph_sym_(*attrs.subgraphs[0]),
+        param_(nnvm::get<MKLDNNConvFusionParam>(attrs.parsed)) {}
 
   void Forward(const OpContext &ctx,
                const std::vector<NDArray> &inputs,
                const std::vector<OpReqType> &req,
                const std::vector<NDArray> &outputs);
 
-  void Backward(const OpContext &ctx, const std::vector<NDArray> &inputs,
-                const std::vector<OpReqType> &req,
-                const std::vector<NDArray> &outputs) {
-    LOG(FATAL) << "Not implemented: subgraph mkldnn Conv only supports "
-                  "inference computation.";
-  }
-
  private:
-  bool initalized_;
+  bool initalized_{false};
+  bool inplace_{false};
+  bool post_requantize_{false};
   nnvm::Symbol subgraph_sym_;
   MKLDNNConvFusionParam param_;
   std::shared_ptr<MKLDNNConvForward> fwd_;
@@ -200,10 +187,12 @@ class SgMKLDNNConvOperator {
   float cached_data_max_;
   float cached_sum_min_;
   float cached_sum_max_;
+  float cached_output_min_;
+  float cached_output_max_;
   size_t weight_ver_;
   size_t bias_ver_;
+  float data_scale_{0.0f};
   std::vector<float> weight_scales_;
-  bool inplace_;
 };
 
 void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
@@ -239,10 +228,6 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
   float sum_max = (mkldnn_param.with_sum && mkldnn_param.quantized)
                       ? inputs[idx++].data().dptr<float>()[0]
                       : 0.0;
-  float *out_min_ptr =
-      mkldnn_param.quantized ? outputs[kMin].data().dptr<float>() : nullptr;
-  float *out_max_ptr =
-      mkldnn_param.quantized ? outputs[kMax].data().dptr<float>() : nullptr;
   CHECK_EQ(input_size, idx);
   bool has_bias = mkldnn_param.with_bn || !conv_param.no_bias;
   NDArray data = inputs[in_data];
@@ -251,10 +236,10 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
   // Copy inputs[in_sum] into outputs[kOut] in case inplace optimization 
failed.
   if (mkldnn_param.with_sum) {
     if (!initalized_) {
-      auto in_mkl_mem = inputs[in_sum].GetMKLDNNData();
-      auto out_mkl_mem = outputs[kOut].GetMKLDNNData();
       // TODO(zhennan): Currently, mkldnn fallback mechanism will break 
inplace option,
       // which make check (req[kOut] == kWriteInplace) useless.
+      auto in_mkl_mem = inputs[in_sum].GetMKLDNNData();
+      auto out_mkl_mem = outputs[kOut].GetMKLDNNData();
       if (in_mkl_mem->get_data_handle() == out_mkl_mem->get_data_handle()) {
         inplace_ = true;
       }
@@ -288,19 +273,6 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
       }
     }
   }
-  bool post_requantize = false;
-  if (mkldnn_param.quantized) {
-    if (mkldnn_param.min_calib_range.has_value() &&
-        mkldnn_param.max_calib_range.has_value()) {
-      post_requantize = true;
-      mkldnn_param.weight_channelwise_scale = true;
-      *out_min_ptr = mkldnn_param.min_calib_range.value();
-      *out_max_ptr = mkldnn_param.max_calib_range.value();
-    } else {
-      mkldnn_param.weight_channelwise_scale = false;
-    }
-  }
-
   if (!initalized_) {
     cached_data_min_ = data_min;
     cached_data_max_ = data_max;
@@ -310,7 +282,7 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
     cached_weight_ = inputs[in_weight].Reorder2Default();
     weight_ver_ = inputs[in_weight].version();
     if (!conv_param.no_bias) {
-      cached_bias_ = inputs[in_bias].Reorder2Default();
+      cached_bias_ = inputs[in_bias];
       bias_ver_ = inputs[in_bias].version();
     } else {
       cached_bias_ = NDArray();
@@ -331,13 +303,22 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
     // Quantize weight and bias.
     if (mkldnn_param.quantized) {
       CHECK(data.dtype() == mshadow::kInt8 || data.dtype() == mshadow::kUint8);
+      if (cached_data_min_ < 0.0f) {
+        CHECK_EQ(data.dtype(), mshadow::kInt8)
+            << "Expect int8 when data_min < 0.0, consider quantize model with 
int8.";
+      }
+      auto weight_channelwise_scale = false;
+      if (mkldnn_param.min_calib_range.has_value() && 
mkldnn_param.max_calib_range.has_value()) {
+        cached_output_min_ = mkldnn_param.min_calib_range.value();
+        cached_output_max_ = mkldnn_param.max_calib_range.value();
+        post_requantize_ = true;
+        weight_channelwise_scale = true;
+      }
       auto data_range = (data.dtype() == mshadow::kInt8) ? kInt8Range : 
kUint8Range;
-      float data_scale = data_range / MaxAbs(cached_data_min_, 
cached_data_max_);
+      data_scale_ = data_range / MaxAbs(cached_data_min_, cached_data_max_);
       MSHADOW_REAL_TYPE_SWITCH(cached_weight_.dtype(), DType, {
-        QuantizeConvWeightBias<DType>(&cached_weight_, &cached_bias_,
-                                      has_bias, data_scale,
-                                      mkldnn_param.weight_channelwise_scale,
-                                      &weight_scales_);
+        weight_scales_ =
+            GetWeightScales<DType>(cached_weight_, weight_channelwise_scale);
       });
       // Collect scale.
       size_t channel = cached_weight_.shape()[0];
@@ -345,29 +326,20 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
       float out_range;
       float quantized_out_range;
       float output_scale;
-      if (cached_data_min_ < 0.0) {
-        // TODO(zhennan): Support int8 input when mkldnn supports.
-        LOG(FATAL) << "Can't handle negetive value for QuantizeData";
-      }
       if (mkldnn_param.with_sum) {
         auto quantized_sum_range = cached_sum_min_ < 0 ? kInt8Range : 
kUint8Range;
         sum_in_scale = quantized_sum_range / MaxAbs(cached_sum_min_, 
cached_sum_max_);
       }
-      if (post_requantize) {
-        quantized_out_range =
-            IsOutputUInt8(mkldnn_param) ? kUint8Range : kInt8Range;
-        out_range = MaxAbs(*out_min_ptr, *out_max_ptr);
+      if (post_requantize_) {
+        quantized_out_range = IsOutputUInt8(mkldnn_param) ? kUint8Range : 
kInt8Range;
+        out_range = MaxAbs(cached_output_min_, cached_output_max_);
         output_scale = quantized_out_range / out_range;
-        full_conv_param.requantize_scales.resize(channel);
-        for (size_t c = 0; c < channel; c++) {
-          auto weight_scale = mkldnn_param.weight_channelwise_scale
-                                  ? weight_scales_[c]
-                                  : weight_scales_[0];
-          full_conv_param.requantize_scales[c] =
-              output_scale / data_scale / weight_scale;
+        full_conv_param.requantize_scales.resize(weight_channelwise_scale ? 
channel : 1);
+        for (size_t c = 0; c < full_conv_param.requantize_scales.size(); c++) {
+          full_conv_param.requantize_scales[c] = output_scale / data_scale_ / 
weight_scales_[c];
         }
       } else {
-        output_scale = data_scale * weight_scales_[0];
+        output_scale = data_scale_ * weight_scales_[0];
         full_conv_param.requantize_scales.resize(0);
       }
       if (mkldnn_param.with_sum)
@@ -376,20 +348,39 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
     fwd_.reset(new MKLDNNConvForward(
         full_conv_param, ctx.is_train, data, cached_weight_,
         has_bias ? &cached_bias_ : nullptr, output));
+    ConvertWeightBias2MKLDNN(full_conv_param, fwd_->fwd_pd, &cached_weight_, 
&cached_bias_,
+                             has_bias, data_scale_, weight_scales_);
+    fwd_->SetNewMem(*data.GetMKLDNNData(), *cached_weight_.GetMKLDNNData(),
+                    has_bias ? cached_bias_.GetMKLDNNData() : nullptr,
+                    *output.GetMKLDNNData());
+    initalized_ = true;
   }
-  initalized_ = true;
-  std::vector<NDArray> new_inputs;
-  std::vector<OpReqType> new_req;
-  if (has_bias) {
-    new_inputs = {data, cached_weight_, cached_bias_};
-    new_req = {req[in_data], req[in_weight], req[in_bias]};
+
+  if (mkldnn_param.quantized) {
+    auto data_mem = 
data.GetMKLDNNDataReorder(fwd_->fwd_pd.src_primitive_desc());
+    mkldnn::memory *mem = 
output.CreateMKLDNNData(fwd_->fwd_pd.dst_primitive_desc());
+    fwd_->SetNewMem(*data_mem, *mem);
+    MKLDNNStream::Get()->RegisterPrim(fwd_->GetFwd());
+    MKLDNNStream::Get()->Submit();
   } else {
-    new_inputs = {data, cached_weight_};
-    new_req = {req[in_data], req[in_weight]};
+    std::vector<NDArray> new_inputs;
+    std::vector<OpReqType> new_req;
+    if (has_bias) {
+      new_inputs = {data, cached_weight_, cached_bias_};
+      new_req = {req[in_data], req[in_weight], req[in_bias]};
+    } else {
+      new_inputs = {data, cached_weight_};
+      new_req = {req[in_data], req[in_weight]};
+    }
+    MKLDNNConvolutionForwardFullFeature(full_conv_param, ctx, fwd_.get(), 
new_inputs, new_req,
+                                        {output});
+  }
+  if (post_requantize_) {
+  float *out_min_ptr = outputs[kMin].data().dptr<float>();
+  float *out_max_ptr = outputs[kMax].data().dptr<float>();
+  *out_min_ptr = cached_output_min_;
+  *out_max_ptr = cached_output_max_;
   }
-  ConvolutionFusionComputeExCPU(full_conv_param, ctx, fwd_.get(), new_inputs,
-                                new_req, {output});
-
   if (mkldnn_param.with_sum) {
     auto out = const_cast<NDArray &>(outputs[kOut]);
     auto format = static_cast<mkldnn::memory::format>(
@@ -411,7 +402,7 @@ static uint32_t SgMKLDNNConvNumInputs(const NodeAttrs 
&attrs) {
   auto const &param = nnvm::get<MKLDNNConvFusionParam>(attrs.parsed);
   auto num_input = DefaultSubgraphOpNumInputs(attrs);
   if (param.full_conv_param.mkldnn_param.quantized)
-    return num_input + 2 + param.full_conv_param.mkldnn_param.with_sum ? 2 : 0;
+    return num_input + 2 + (param.full_conv_param.mkldnn_param.with_sum ? 2 : 
0);
   else
     return num_input;
 }
@@ -431,6 +422,7 @@ static void SgMKLDNNConvParamParser(nnvm::NodeAttrs *attrs) 
{
     os << ")";
     throw dmlc::ParamError(os.str());
   }
+  CHECK_EQ(attrs->subgraphs.size(), 1);
   auto subgraph_sym = attrs->subgraphs[0];
   DFSVisit(subgraph_sym->outputs, [&](const nnvm::NodePtr &node) {
     if (node->is_variable()) return;
@@ -448,10 +440,23 @@ static void SgMKLDNNConvParamParser(nnvm::NodeAttrs 
*attrs) {
   attrs->parsed = std::move(param_);
 }
 
-static std::vector<std::string> SgMKLDNNConvListInputNames(
-    const NodeAttrs &attrs) {
+static std::vector<std::string> SgMKLDNNConvListInputNames(const NodeAttrs 
&attrs) {
   auto const &param = nnvm::get<MKLDNNConvFusionParam>(attrs.parsed);
-  std::vector<std::string> input_names = DefaultSubgraphOpListInputs(attrs);
+  std::vector<std::string> input_names;
+  input_names.emplace_back("data");
+  input_names.emplace_back("weight");
+  if (!param.full_conv_param.conv_param.no_bias) {
+    input_names.emplace_back("bias");
+  }
+  if (param.full_conv_param.mkldnn_param.with_bn) {
+    input_names.emplace_back("gamma");
+    input_names.emplace_back("beta");
+    input_names.emplace_back("mean");
+    input_names.emplace_back("var");
+  }
+  if (param.full_conv_param.mkldnn_param.with_sum) {
+    input_names.emplace_back("sum");
+  }
   if (param.full_conv_param.mkldnn_param.quantized) {
     input_names.emplace_back("data_min");
     input_names.emplace_back("data_max");
@@ -460,6 +465,7 @@ static std::vector<std::string> SgMKLDNNConvListInputNames(
       input_names.emplace_back("sum_max");
     }
   }
+  CHECK_EQ(input_names.size(), SgMKLDNNConvNumInputs(attrs));
   return input_names;
 }
 
diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv_property.cc 
b/src/operator/subgraph/mkldnn/mkldnn_conv_property.cc
index e5220f2..e462191 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_conv_property.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_conv_property.cc
@@ -66,17 +66,21 @@ class SgMKLDNNConvSelector : public SubgraphSelector {
   }
 
   bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override {
-    if (status == kFail || status == kSuccess || new_node.is_variable())
-      return false;
     // If n isn't the last matched node, then we encoutered a internal
     // branch, we should pop out the node behind n and stop fusion.
     if (matched_list.back() != &n) {
-      while (matched_list.back() != &n) {
-        matched_list.pop_back();
+      if (std::find(matched_list.begin(), matched_list.end(), &n) !=
+          matched_list.end()) {
+        while (matched_list.back() != &n) {
+          matched_list.pop_back();
+        }
       }
       status = kSuccess;
       return false;
     }
+    if (status == kFail || status == kSuccess || new_node.is_variable())
+      return false;
+
     // Use status machine to do selection. The status change is
     // kStart -> kBN -> kSum -> kSuccess
     switch (status) {
@@ -99,12 +103,11 @@ class SgMKLDNNConvSelector : public SubgraphSelector {
               nnvm::get<ActivationParam>(new_node.attrs.parsed);
           if (param.act_type == activation::kReLU) {
             matched_list.push_back(&new_node);
-            // If we find conv+relu, then we can't match bn anymore.
-            if (status == kStart) status = kBN;
-            return true;
-          } else {
+            // If we find conv+relu, then we can't match anymore.
+            // TODO(zhennan): mkldnn only supports convolution + relu + sum in
+            // int8, not in fp32. So we disable this pattern at moment.
             status = kSuccess;
-            return false;
+            return true;
           }
         }
         status = kSuccess;
@@ -117,7 +120,15 @@ class SgMKLDNNConvSelector : public SubgraphSelector {
     if (status == kFail) {
       return std::vector<nnvm::Node *>(0);
     } else {
-      return candidates;
+      std::vector<nnvm::Node *> ret;
+      for (auto i : matched_list) {
+        auto non_const_i = const_cast<nnvm::Node *>(i);
+        if (std::find(candidates.begin(), candidates.end(), non_const_i) !=
+            candidates.end()) {
+          ret.push_back(non_const_i);
+        }
+      }
+      return ret;
     }
   }
 };
@@ -130,8 +141,7 @@ class SgMKLDNNConvProperty : public SubgraphProperty {
     disable_conv_relu = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_FUSE_CONV_RELU", 0);
     disable_conv_sum = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_FUSE_CONV_SUM", 0);
 
-    disable_all =
-        disable_all && disable_conv_bn && disable_conv_relu && 
disable_conv_sum;
+    disable_all = disable_all || (disable_conv_bn && disable_conv_relu && 
disable_conv_sum);
     if (disable_all) {
       LOG(INFO) << "MKLDNN Convolution optimization pass is disabled.";
     } else {
diff --git a/tests/python/mkl/test_subgraph.py 
b/tests/python/mkl/test_subgraph.py
index be6feae..313668c 100644
--- a/tests/python/mkl/test_subgraph.py
+++ b/tests/python/mkl/test_subgraph.py
@@ -35,14 +35,14 @@ from mxnet.test_utils import assert_almost_equal
 
 DATA_SHAPE=[(4, 4, 10, 10), (32, 3, 24, 24), (64, 8, 64, 64)]
 
-def check_qsym_calibrated(qsym):
+def check_qsym_calibrated(qsym, out_type):
   assert ''.join(qsym.attr_dict().keys()).find('quantized_sg_mkldnn_conv') != 
-1
   for k, v in qsym.attr_dict().items():
     if k.find('quantized_sg_mkldnn_conv') != -1:
       assert 'min_calib_range' in v
       assert 'max_calib_range' in v
     if k.find('_quantize') != -1:
-      assert v['out_type'] == 'uint8'
+      assert v['out_type'] == out_type
 
 def check_qsym_forward(qsym, qarg_params, qaux_params, batch, data_shape, 
label_shape):
   mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
@@ -66,7 +66,7 @@ def check_qsym_dummy_forward(qsym, batch, data_shape, 
label_shape):
     output.wait_to_read()
   return mod.get_outputs()
 
-def check_quantize(sym, data_shape, check_conv=True):
+def check_quantize(sym, data_shape, out_type, check_conv=True):
   fc = mx.sym.FullyConnected(data=sym, num_hidden=10, flatten=True, name='fc')
   sym = mx.sym.SoftmaxOutput(data=fc, name='softmax')
   sym_sg = sym.get_backend_symbol("MKLDNN")
@@ -99,15 +99,14 @@ def check_quantize(sym, data_shape, check_conv=True):
                                                                    
aux_params=aux_params,
                                                                    
ctx=mx.current_context(),
                                                                    
excluded_sym_names=excluded_sym_names,
-                                                                   
quantized_dtype='uint8',
+                                                                   
quantized_dtype=out_type,
                                                                    
calib_mode='naive',
                                                                    
calib_data=calib_data,
                                                                    
calib_layer=calib_layer,
-                                                                   
calib_quantize_op=True,
                                                                    
num_calib_examples=5)
   qsym = qsym.get_backend_symbol("MKLDNN_POST_QUANTIZE")
   if check_conv:
-    check_qsym_calibrated(qsym)
+    check_qsym_calibrated(qsym, out_type)
   quantized_out = check_qsym_forward(qsym, qarg_params, qaux_params, batch, 
data_shape, label_shape)
   for i in range(len(ref_out)):
     assert_almost_equal(ref_out[i].asnumpy(), quantized_out[i].asnumpy(), atol 
= 1)
@@ -135,8 +134,9 @@ def check_fusion(sym, data_shape, attrs_op):
   for i in range(len(exe.outputs)):
     assert_almost_equal(exe.outputs[i].asnumpy(), exe_sg.outputs[i].asnumpy(), 
rtol=1e-3, atol=1e-3)
 
-  # fp32 to uint8
-  check_quantize(sym, data_shape)
+  # fp32 to int8
+  for out_type in ('uint8', 'int8', 'auto'):
+    check_quantize(sym, data_shape, out_type)
 
 def check_neg_fusion(syms, attrs_name=None, excluded_attrs=None, 
date_shape=(4,4,10,10)):
   for sym, attrs, excluded_attr in zip(syms, attrs_name, excluded_attrs):
@@ -475,12 +475,13 @@ def test_pos_conv_bn_sum_relu():
 
 def test_pos_single_concat():
   for data_shape in DATA_SHAPE:
-    net = single_concat(data_shape, 2, 1)
-    check_quantize(net, data_shape, False)
-    net = single_concat(data_shape, 4, 2)
-    check_quantize(net, data_shape, False)
-    net = single_concat(data_shape, 4, 3)
-    check_quantize(net, data_shape, False)
+    for out_type in ('uint8', 'int8', 'auto'):
+      net = single_concat(data_shape, 2, 1)
+      check_quantize(net, data_shape, out_type, False)
+      net = single_concat(data_shape, 4, 2)
+      check_quantize(net, data_shape, out_type, False)
+      net = single_concat(data_shape, 4, 3)
+      check_quantize(net, data_shape, out_type, False)
 
 @with_seed()
 def test_neg_conv_bn():
diff --git a/tests/python/unittest/test_operator.py 
b/tests/python/unittest/test_operator.py
index 7b5b9eb..1f42215 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -6821,7 +6821,7 @@ def test_op_output_names_monitor():
             output_names.append(py_str(name))
 
         op_exe = op_sym.simple_bind(ctx=mx.current_context(), grad_req='null')
-        op_exe.set_monitor_callback(get_output_names_callback)
+        op_exe.set_monitor_callback(get_output_names_callback, 
monitor_all=False)
         op_exe.forward()
         for output_name, expected_name in zip(output_names, expected_names):
             assert output_name == expected_name
@@ -6859,6 +6859,51 @@ def test_op_output_names_monitor():
                             name='pooling')
     check_name(us_sym, ['pooling_output'])
 
+def test_op_all_names_monitor():
+    def check_name(op_sym, expected_names):
+        output_names = []
+
+        def get_output_names_callback(name, arr):
+            output_names.append(py_str(name))
+
+        op_exe = op_sym.simple_bind(ctx=mx.current_context(), grad_req='null')
+        op_exe.set_monitor_callback(get_output_names_callback, 
monitor_all=True)
+        op_exe.forward()
+        for output_name, expected_name in zip(output_names, expected_names):
+            assert output_name == expected_name
+
+    data = mx.sym.Variable('data', shape=(10, 3, 10, 10))
+    conv_sym = mx.sym.Convolution(data, kernel=(2, 2), num_filter=1, 
name='conv')
+    check_name(conv_sym, ['data', 'conv_data', 'conv_weight', 'conv_weight', 
'conv_bias', 'conv_bias', 'conv_output'])
+
+    deconv_sym = mx.sym.Deconvolution(data, kernel=(2, 2), num_filter=1, 
name='deconv')
+    check_name(deconv_sym, ['data', 'deconv_data', 'deconv_weight', 
'deconv_weight', 'deconv_output'])
+
+    fc_sym = mx.sym.FullyConnected(data, num_hidden=10, name='fc')
+    check_name(fc_sym, ['data', 'fc_data', 'fc_weight', 'fc_weight', 
'fc_bias', 'fc_bias', 'fc_output'])
+
+    lrn_sym = mx.sym.LRN(data, nsize=1, name='lrn')
+    check_name(lrn_sym, ['data', 'lrn_data', 'lrn_output', 'lrn_tmp_norm'])
+
+    act_sym = mx.sym.Activation(data, act_type='relu', name='act')
+    check_name(act_sym, ['data', 'act_input0', 'act_output'])
+
+    cc_sym = mx.sym.concat(data, data, dim=0, name='concat')
+    check_name(cc_sym, ['data', 'concat_arg0', 'data', 'concat_arg1', 
'concat_output'])
+
+    sm_sym = mx.sym.softmax(data, name='softmax')
+    check_name(sm_sym, ['data', 'softmax_input0', 'softmax_output'])
+
+    sa_sym = mx.sym.SoftmaxActivation(data, name='softmax')
+    check_name(sa_sym, ['data', 'softmax_input0', 'softmax_output'])
+
+    us_sym = mx.sym.UpSampling(data, scale=2, sample_type='nearest',
+                               name='upsampling')
+    check_name(us_sym, ['data', 'upsampling_arg0', 'upsampling_output'])
+
+    us_sym = mx.sym.Pooling(data, kernel=(2, 2), pool_type='avg',
+                            name='pooling')
+    check_name(us_sym, ['data', 'pooling_data', 'pooling_output'])
 
 @with_seed()
 def test_activation():

Reply via email to