This is an automated email from the ASF dual-hosted git repository.
patriczhao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new a083a61 [MKLDNN]Improve quantizeV2 and dequantize latency (#14641)
a083a61 is described below
commit a083a61fb295d7db08443a7464c64f8bf4902320
Author: Zhennan Qin <[email protected]>
AuthorDate: Wed Apr 17 20:14:03 2019 -0500
[MKLDNN]Improve quantizeV2 and dequantize latency (#14641)
* stateful_quantize
* fix lint
* Fix build
* fix gpu build
* Fix typo
* Move check to online calibration
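
[Editor's note] The patch replaces the stateless FCompute/FComputeEx kernels with stateful operator classes (DequantizeOperator, SgMKLDNNDequantizeOperator, SgMKLDNNQuantizeOperator, QuantizeV2Operator) registered through FCreateOpState, so the MKL-DNN reorder primitive and the min/max-derived scale are built once and reused while the calibration range stays unchanged. Below is a minimal, self-contained sketch of that caching pattern; the `Primitive` struct and `BuildPrimitive()` helper are illustrative placeholders, not MXNet or MKL-DNN API.

```cpp
#include <algorithm>
#include <cmath>
#include <iostream>
#include <memory>

struct Primitive {   // placeholder for a cached mkldnn::reorder primitive
  float scale;
};

class StatefulDequantize {
 public:
  void Forward(float data_min, float data_max) {
    // Invalidate the cache only when the calibration range changes.
    if (initialized_ && (cached_min_ != data_min || cached_max_ != data_max))
      initialized_ = false;
    if (!initialized_) {
      cached_min_ = data_min;
      cached_max_ = data_max;
      fwd_ = BuildPrimitive(data_min, data_max);  // expensive setup, done once
      initialized_ = true;
    }
    // Every subsequent call reuses the cached primitive.
    std::cout << "scale = " << fwd_->scale << '\n';
  }

 private:
  std::unique_ptr<Primitive> BuildPrimitive(float lo, float hi) {
    float real_range = std::max(std::fabs(lo), std::fabs(hi));
    return std::unique_ptr<Primitive>(new Primitive{real_range / 127.f});
  }
  bool initialized_{false};
  float cached_min_{0.f};
  float cached_max_{0.f};
  std::unique_ptr<Primitive> fwd_;
};

int main() {
  StatefulDequantize op;
  op.Forward(-1.f, 1.f);  // builds the primitive
  op.Forward(-1.f, 1.f);  // reuses it; no rebuild
}
```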
---
src/operator/quantization/dequantize-inl.h | 62 ++++---
src/operator/quantization/dequantize.cc | 21 ++-
src/operator/quantization/dequantize.cu | 2 +-
.../quantization/mkldnn/mkldnn_dequantize-inl.h | 140 ++++++++------
.../quantization/mkldnn/mkldnn_quantize_v2-inl.h | 202 +++++++++++----------
src/operator/quantization/quantize_v2-inl.h | 185 ++++++++++---------
src/operator/quantization/quantize_v2.cc | 21 ++-
src/operator/quantization/quantize_v2.cu | 2 +-
src/operator/subgraph/mkldnn/mkldnn_conv.cc | 16 +-
9 files changed, 377 insertions(+), 274 deletions(-)
diff --git a/src/operator/quantization/dequantize-inl.h b/src/operator/quantization/dequantize-inl.h
index 7c91ad5..92b74b7 100644
--- a/src/operator/quantization/dequantize-inl.h
+++ b/src/operator/quantization/dequantize-inl.h
@@ -68,30 +68,6 @@ struct dequantize_zero_centered {
}
};
-template<typename xpu>
-void DequantizeCompute(const nnvm::NodeAttrs& attrs,
- const OpContext& ctx,
- const std::vector<TBlob>& inputs,
- const std::vector<OpReqType>& req,
- const std::vector<TBlob>& outputs) {
- using namespace mshadow;
- using namespace mxnet_op;
- using mshadow::red::limits::MinValue;
- using mshadow::red::limits::MaxValue;
- Stream<xpu> *s = ctx.get_stream<xpu>();
- if (inputs[0].type_flag_ == mshadow::kUint8) {
-    Kernel<dequantize_unsigned, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<float>(),
-      inputs[0].dptr<uint8_t>(), inputs[1].dptr<float>(), inputs[2].dptr<float>(),
-      MinValue<uint8_t>(), MaxValue<uint8_t>());
-  } else if (inputs[0].type_flag_ == mshadow::kInt8) {
-    Kernel<dequantize_zero_centered, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<float>(),
-      inputs[0].dptr<int8_t>(), inputs[1].dptr<float>(), inputs[2].dptr<float>(),
-      MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
- } else {
- LOG(FATAL) << "dequantize op only supports input type int8 or uint8";
- }
-}
-
inline bool DequantizeShape(const nnvm::NodeAttrs& attrs,
mxnet::ShapeVector *in_attrs,
mxnet::ShapeVector *out_attrs) {
@@ -119,6 +95,44 @@ inline bool DequantizeType(const nnvm::NodeAttrs& attrs,
return (*in_attrs)[0] != -1;
}
+template <typename xpu>
+class DequantizeOperator {
+ public:
+ explicit DequantizeOperator(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {}
+ void Forward(const OpContext &ctx, const std::vector<TBlob> &inputs,
+               const std::vector<OpReqType> &req, const std::vector<TBlob> &outputs) {
+ using namespace mshadow;
+ using namespace mxnet_op;
+ using mshadow::red::limits::MaxValue;
+ using mshadow::red::limits::MinValue;
+ Stream<xpu> *s = ctx.get_stream<xpu>();
+ if (inputs[0].type_flag_ == mshadow::kUint8) {
+      Kernel<dequantize_unsigned, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<float>(),
+                                               inputs[0].dptr<uint8_t>(), inputs[1].dptr<float>(),
+                                               inputs[2].dptr<float>(), MinValue<uint8_t>(),
+                                               MaxValue<uint8_t>());
+    } else if (inputs[0].type_flag_ == mshadow::kInt8) {
+      Kernel<dequantize_zero_centered, xpu>::Launch(
+          s, outputs[0].Size(), outputs[0].dptr<float>(), inputs[0].dptr<int8_t>(),
+          inputs[1].dptr<float>(), inputs[2].dptr<float>(),
+          MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
+ } else {
+ LOG(FATAL) << "dequantize op only supports input type int8 or uint8";
+ }
+ }
+
+ private:
+ nnvm::NodeAttrs attrs_;
+};
+
+template <typename xpu>
+static void DequantizeForward(const OpStatePtr &state_ptr, const OpContext &ctx,
+                              const std::vector<TBlob> &inputs, const std::vector<OpReqType> &req,
+                              const std::vector<TBlob> &outputs) {
+ auto &op = state_ptr.get_state<DequantizeOperator<xpu>>();
+ op.Forward(ctx, inputs, req, outputs);
+}
+
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_QUANTIZATION_DEQUANTIZE_INL_H_
diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc
index 7c84673..dd433e4 100644
--- a/src/operator/quantization/dequantize.cc
+++ b/src/operator/quantization/dequantize.cc
@@ -48,6 +48,22 @@ bool DequantizeStorageType(const nnvm::NodeAttrs& attrs,
return true;
}
+static OpStatePtr CreateDequantizeState(const nnvm::NodeAttrs &attrs, Context ctx,
+ const std::vector<TShape> &in_shapes,
+ const std::vector<int> &in_types) {
+ OpStatePtr state;
+ if (ctx.dev_type == kGPU) {
+ state = OpStatePtr::Create<DequantizeOperator<gpu>>(attrs);
+ } else {
+#if MXNET_USE_MKLDNN == 1
+ state = OpStatePtr::Create<SgMKLDNNDequantizeOperator>(attrs);
+#else
+ state = OpStatePtr::Create<DequantizeOperator<cpu>>(attrs);
+#endif
+ }
+ return state;
+}
+
NNVM_REGISTER_OP(_contrib_dequantize)
.describe(R"code(Dequantize the input tensor into a float tensor.
min_range and max_range are scalar floats that specify the range for
@@ -74,11 +90,12 @@ by keep zero centered for the quantized value:
// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
// will be reverted after the improvement of CachedOP is done.
.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.set_attr<FCreateOpState>("FCreateOpState", CreateDequantizeState)
#if MXNET_USE_MKLDNN == 1
.set_attr<bool>("TIsMKLDNN", true)
-.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNDequantizeCompute)
+.set_attr<FStatefulComputeEx>("FStatefulComputeEx<cpu>", SgMKLDNNDequantizeForward)
#endif
-.set_attr<FCompute>("FCompute<cpu>", DequantizeCompute<cpu>)
+.set_attr<FStatefulCompute>("FStatefulCompute<cpu>", DequantizeForward<cpu>)
.add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `uint8`")
.add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value "
"possibly produced for the input in float32")
diff --git a/src/operator/quantization/dequantize.cu b/src/operator/quantization/dequantize.cu
index ca5f91c..dee8b22 100644
--- a/src/operator/quantization/dequantize.cu
+++ b/src/operator/quantization/dequantize.cu
@@ -28,7 +28,7 @@ namespace mxnet {
namespace op {
NNVM_REGISTER_OP(_contrib_dequantize)
-.set_attr<FCompute>("FCompute<gpu>", DequantizeCompute<gpu>);
+.set_attr<FStatefulCompute>("FStatefulCompute<gpu>", DequantizeForward<gpu>);
} // namespace op
} // namespace mxnet
diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
index b66adf7..27fa070 100644
--- a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
+++ b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
@@ -26,82 +26,104 @@
#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_
#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_
#if MXNET_USE_MKLDNN == 1
-#include <string>
#include <algorithm>
+#include <string>
#include <vector>
#include "../../nn/mkldnn/mkldnn_base-inl.h"
namespace mxnet {
namespace op {
-template<typename SrcType, typename DstType>
-static void MKLDNNDequantizeComputeKer(const std::vector<NDArray> &inputs,
- const std::vector<NDArray> &outputs,
- const std::vector<OpReqType> &req) {
- using namespace mshadow;
- using namespace mxnet_op;
- using red::limits::MaxValue;
- using red::limits::MinValue;
- float real_range = 0.0;
- float quantized_range = 0.0;
- if (inputs[0].dtype() == mshadow::kUint8) {
- quantized_range = MaxAbs(MaxValue<SrcType>(), MinValue<SrcType>());
-    real_range = MaxAbs(*inputs[1].data().dptr<DstType>(), *inputs[2].data().dptr<DstType>());
-  } else if (inputs[0].dtype() == mshadow::kInt8) {
-    quantized_range = MinAbs(MaxValue<SrcType>(), MinValue<SrcType>());
-    real_range = MaxAbs(*inputs[1].data().dptr<DstType>(), *inputs[2].data().dptr<DstType>());
-  } else {
-    LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as output type";
- }
- float scale = real_range / quantized_range;
- primitive_attr attr;
- const int mask = 0;
- std::vector<float> scales = {scale};
- attr.set_output_scales(mask, scales);
- attr.set_int_output_round_mode(round_nearest);
- mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
- NDArray in_buffer = inputs[0];
- if (inputs[0].IsView() && inputs[0].IsMKLDNNData())
- in_buffer = inputs[0].Reorder2Default();
+class SgMKLDNNDequantizeOperator {
+ public:
+ explicit SgMKLDNNDequantizeOperator(const nnvm::NodeAttrs &attrs)
+ : param_(nnvm::get<DequantizeParam>(attrs.parsed)) {}
+ void Forward(const OpContext &ctx, const std::vector<NDArray> &inputs,
+               const std::vector<OpReqType> &req, const std::vector<NDArray> &outputs);
+
+ private:
+ bool initialized_{false};
+ DequantizeParam param_;
+ float cached_data_min_{0.f};
+ float cached_data_max_{0.f};
+ std::shared_ptr<mkldnn::memory> i_mem_;
+ std::shared_ptr<mkldnn::memory> o_mem_;
+ std::shared_ptr<mkldnn::reorder> fwd_pd_;
+};
+
+void SgMKLDNNDequantizeOperator::Forward(const OpContext &ctx, const std::vector<NDArray> &inputs,
+                                         const std::vector<OpReqType> &req,
+                                         const std::vector<NDArray> &outputs) {
+  NDArray in_buffer = inputs[0];
+  if (inputs[0].IsView() && inputs[0].IsMKLDNNData()) in_buffer = inputs[0].Reorder2Default();
auto i_mem = in_buffer.GetMKLDNNData();
- auto i_mpd = i_mem->get_primitive_desc();
- auto i_desc = i_mpd.desc();
- size_t i_ndim = in_buffer.shape().ndim();
- mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
- for (size_t i = 0; i < i_ndim; i++) {
- i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
- }
-  mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
- if (i_fmt == mkldnn::memory::format::nhwc) {
- // For 4d tensor, nchw is the default format
- i_fmt = mkldnn::memory::format::nchw;
+ float data_min = *inputs[1].data().dptr<float>();
+ float data_max = *inputs[2].data().dptr<float>();
+
+  if (initialized_ && (cached_data_min_ != data_min || cached_data_max_ != data_max))
+ initialized_ = false;
+
+ if (!initialized_) {
+ cached_data_min_ = data_min;
+ cached_data_max_ = data_max;
+ float real_range = MaxAbs(cached_data_min_, cached_data_max_);
+ float quantized_range = 0.0;
+ if (inputs[0].dtype() == mshadow::kUint8) {
+ quantized_range = kUint8Range;
+ } else if (inputs[0].dtype() == mshadow::kInt8) {
+ quantized_range = kInt8Range;
+      real_range = MaxAbs(*inputs[1].data().dptr<float>(), *inputs[2].data().dptr<float>());
+    } else {
+      LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as output type";
+ }
+ float scale = real_range / quantized_range;
+ primitive_attr attr;
+ const int mask = 0;
+ std::vector<float> scales = {scale};
+ attr.set_output_scales(mask, scales);
+ attr.set_int_output_round_mode(round_nearest);
+ mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
+ auto i_mpd = i_mem->get_primitive_desc();
+ auto i_desc = i_mpd.desc();
+ size_t i_ndim = in_buffer.shape().ndim();
+ mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
+ for (size_t i = 0; i < i_ndim; i++) {
+ i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
+ }
+    mkldnn::memory::format o_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
+ if (o_fmt == mkldnn::memory::format::nhwc) {
+ // For 4d tensor, nchw is the default format
+ o_fmt = mkldnn::memory::format::nchw;
+ }
+ auto o_desc =
+        mkldnn::memory::desc(i_dims, (mkldnn::memory::data_type)data_type_enum<float>::type, o_fmt);
+ auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
+ auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr);
+ i_mem_ = std::make_shared<mkldnn::memory>(i_mpd, nullptr);
+ o_mem_ = std::make_shared<mkldnn::memory>(o_mpd, nullptr);
+ fwd_pd_ = std::make_shared<mkldnn::reorder>(reorder_pd, *i_mem_, *o_mem_);
+ initialized_ = true;
}
-  auto o_desc = mkldnn::memory::desc(i_dims,
-                                     (mkldnn::memory::data_type)data_type_enum<DstType>::type,
-                                     i_fmt);
-  auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
-  auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr);
-  auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]);
-  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second));
+  auto o_mem = CreateMKLDNNMem(outputs[0], o_mem_->get_primitive_desc(), req[0]);
+ i_mem_->set_data_handle(i_mem->get_data_handle());
+ o_mem_->set_data_handle(o_mem.second->get_data_handle());
+ MKLDNNStream::Get()->RegisterPrim(*fwd_pd_);
CommitOutput(outputs[0], o_mem);
MKLDNNStream::Get()->Submit();
}
-static void MKLDNNDequantizeCompute(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
- if (inputs[0].dtype() == mshadow::kUint8) {
- MKLDNNDequantizeComputeKer<uint8_t, float>(inputs, outputs, req);
- } else if (inputs[0].dtype() == mshadow::kInt8) {
- MKLDNNDequantizeComputeKer<int8_t, float>(inputs, outputs, req);
- } else {
- LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as input
type";
- }
+static void SgMKLDNNDequantizeForward(const OpStatePtr &state_ptr, const OpContext &ctx,
+                                      const std::vector<NDArray> &inputs,
+                                      const std::vector<OpReqType> &req,
+                                      const std::vector<NDArray> &outputs) {
+  SgMKLDNNDequantizeOperator &op = state_ptr.get_state<SgMKLDNNDequantizeOperator>();
+ op.Forward(ctx, inputs, req, outputs);
}
+
+
} // namespace op
} // namespace mxnet
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
index d6060e5..2da4158 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
+++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
@@ -34,99 +34,37 @@
namespace mxnet {
namespace op {
-template <typename SrcType, typename DstType>
-static void MKLDNNQuantizeComputeKer(const std::vector<NDArray>& inputs,
- const std::vector<NDArray>& outputs,
- const QuantizeV2Param& param,
- const std::vector<OpReqType>& req) {
- using namespace mshadow;
- using namespace mxnet_op;
- using red::limits::MaxValue;
- using red::limits::MinValue;
- SrcType real_range = 0.f;
- DstType quantized_range = 0;
- NDArray in_buffer = inputs[0];
- SrcType data_min = red::limits::MaxValue<SrcType>();
- SrcType data_max = red::limits::MinValue<SrcType>();
- if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
- data_min = param.min_calib_range.value();
- data_max = param.max_calib_range.value();
- } else {
- // no calib info
- in_buffer = inputs[0].Reorder2Default();
- auto in_ptr = in_buffer.data().dptr<SrcType>();
- auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
- std::vector<SrcType> data_maxs(nthreads, data_max);
- std::vector<SrcType> data_mins(nthreads, data_min);
-#pragma omp parallel for num_threads(nthreads)
-    for (index_t i = 0; i < static_cast<index_t>(in_buffer.shape().Size()); i++) {
- int tid = omp_get_thread_num();
- if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i];
- if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i];
- }
- for (index_t i = 0; i < nthreads; i++) {
- if (data_maxs[i] > data_max) data_max = data_maxs[i];
- if (data_mins[i] < data_min) data_min = data_mins[i];
- }
- }
+class SgMKLDNNQuantizeOperator {
+ public:
+ explicit SgMKLDNNQuantizeOperator(const nnvm::NodeAttrs &attrs)
+ : param_(nnvm::get<QuantizeV2Param>(attrs.parsed)) {}
- auto out_type = GetOutputType(param);
- if (out_type == mshadow::kUint8) {
- real_range = std::max<SrcType>(0.f, data_max);
- quantized_range = MaxValue<DstType>();
- *outputs[1].data().dptr<float>() = 0.f;
- *outputs[2].data().dptr<float>() = real_range;
- } else if (out_type == mshadow::kInt8) {
- real_range = MaxAbs(data_min, data_max);
- quantized_range = MinAbs(MaxValue<DstType>(), MinValue<DstType>());
- *outputs[1].data().dptr<float>() = -real_range;
- *outputs[2].data().dptr<float>() = real_range;
- } else {
- LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output
type";
- }
- float scale = static_cast<float>(quantized_range) / real_range;
+ void Forward(const OpContext &ctx, const std::vector<NDArray> &inputs,
+               const std::vector<OpReqType> &req, const std::vector<NDArray> &outputs);
- primitive_attr attr;
- const int mask = 0;
- std::vector<float> scales = {scale};
- attr.set_output_scales(mask, scales);
- attr.set_int_output_round_mode(round_nearest);
- mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
+ private:
+ bool initalized_{false};
+ QuantizeV2Param param_;
+ float cached_data_min_{0.f};
+ float cached_data_max_{0.f};
+ std::shared_ptr<mkldnn::memory> i_mem_;
+ std::shared_ptr<mkldnn::memory> o_mem_;
+ std::shared_ptr<mkldnn::reorder> fwd_pd_;
+};
-  if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) in_buffer = inputs[0].Reorder2Default();
- auto i_mem = in_buffer.GetMKLDNNData();
- auto i_mpd = i_mem->get_primitive_desc();
- auto i_desc = i_mpd.desc();
-  mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
- if (i_fmt == mkldnn::memory::format::nchw ||
- i_fmt == mkldnn::memory::format::nChw8c ||
- i_fmt == mkldnn_nChw16c) {
- i_fmt = mkldnn::memory::format::nhwc;
- }
- size_t i_ndim = in_buffer.shape().ndim();
- mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
- for (size_t i = 0; i < i_ndim; i++) {
- i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
- }
- auto o_desc =
-      mkldnn::memory::desc(i_dims, (mkldnn::memory::data_type)data_type_enum<DstType>::type, i_fmt);
- auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
- auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr);
- auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]);
-  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second));
- CommitOutput(outputs[0], o_mem);
- MKLDNNStream::Get()->Submit();
-}
+void SgMKLDNNQuantizeOperator::Forward(const OpContext &ctx, const std::vector<NDArray> &inputs,
+ const std::vector<OpReqType> &req,
+ const std::vector<NDArray> &outputs) {
+ float quantized_range = 0.0;
+ NDArray in_buffer = inputs[0];
+ float data_min = mshadow::red::limits::MaxValue<float>();
+ float data_max = mshadow::red::limits::MinValue<float>();
-static void MKLDNNQuantizeV2Compute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
- const std::vector<NDArray>& inputs,
- const std::vector<OpReqType>& req,
- const std::vector<NDArray>& outputs) {
- const QuantizeV2Param& param = nnvm::get<QuantizeV2Param>(attrs.parsed);
+ // Pass through quantized data
   if (inputs[0].dtype() == mshadow::kUint8 || inputs[0].dtype() == mshadow::kInt8) {
-    if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
-      *outputs[1].data().dptr<float>() = param.min_calib_range.value();
-      *outputs[2].data().dptr<float>() = param.max_calib_range.value();
+    if (param_.min_calib_range.has_value() && param_.max_calib_range.has_value()) {
+      *outputs[1].data().dptr<float>() = param_.min_calib_range.value();
+      *outputs[2].data().dptr<float>() = param_.max_calib_range.value();
     } else {
if (inputs[0].dtype() == mshadow::kUint8) {
*outputs[1].data().dptr<float>() = 0;
@@ -137,21 +75,101 @@ static void MKLDNNQuantizeV2Compute(const nnvm::NodeAttrs& attrs, const OpContex
}
}
if (req[0] != kWriteInplace) {
- const_cast<NDArray&>(outputs[0]).CopyFrom(*inputs[0].GetMKLDNNData());
+ const_cast<NDArray &>(outputs[0]).CopyFrom(*inputs[0].GetMKLDNNData());
MKLDNNStream::Get()->Submit();
}
} else {
- auto out_type = GetOutputType(param);
+    if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) in_buffer = inputs[0].Reorder2Default();
+ auto i_mem = in_buffer.GetMKLDNNData();
+
+    if (param_.min_calib_range.has_value() && param_.max_calib_range.has_value()) {
+ data_min = param_.min_calib_range.value();
+ data_max = param_.max_calib_range.value();
+ } else {
+ // no calib info
+ in_buffer = inputs[0].Reorder2Default();
+ auto in_ptr = in_buffer.data().dptr<float>();
+ auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+ std::vector<float> data_maxs(nthreads, data_max);
+ std::vector<float> data_mins(nthreads, data_min);
+#pragma omp parallel for num_threads(nthreads)
+      for (index_t i = 0; i < static_cast<index_t>(in_buffer.shape().Size()); i++) {
+ int tid = omp_get_thread_num();
+ if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i];
+ if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i];
+ }
+ for (index_t i = 0; i < nthreads; i++) {
+ if (data_maxs[i] > data_max) data_max = data_maxs[i];
+ if (data_mins[i] < data_min) data_min = data_mins[i];
+ }
+
+      if (initalized_ && (cached_data_min_ != data_min || cached_data_max_ != data_max))
+ initalized_ = false;
+ }
+
+ // Write output min/max
+ auto out_type = GetOutputType(param_);
if (out_type == mshadow::kUint8) {
- MKLDNNQuantizeComputeKer<float, uint8_t>(inputs, outputs, param, req);
+ quantized_range = kUint8Range;
+ *outputs[1].data().dptr<float>() = data_min;
+ *outputs[2].data().dptr<float>() = data_max;
} else if (out_type == mshadow::kInt8) {
- MKLDNNQuantizeComputeKer<float, int8_t>(inputs, outputs, param, req);
+ float real_range = MaxAbs(data_min, data_max);
+ quantized_range = kInt8Range;
+ *outputs[1].data().dptr<float>() = -real_range;
+ *outputs[2].data().dptr<float>() = real_range;
} else {
LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output
type";
}
+
+ if (!initalized_) {
+ cached_data_min_ = data_min;
+ cached_data_max_ = data_max;
+ float real_range = MaxAbs(data_min, data_max);
+ float scale = quantized_range / real_range;
+ primitive_attr attr;
+ const int mask = 0;
+ std::vector<float> scales = {scale};
+ attr.set_output_scales(mask, scales);
+ attr.set_int_output_round_mode(round_nearest);
+ mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine();
+ auto i_mpd = i_mem->get_primitive_desc();
+ auto i_desc = i_mpd.desc();
+      mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
+      if (i_fmt == mkldnn::memory::format::nchw || i_fmt == mkldnn::memory::format::nChw8c ||
+ i_fmt == mkldnn_nChw16c) {
+ i_fmt = mkldnn::memory::format::nhwc;
+ }
+ size_t i_ndim = in_buffer.shape().ndim();
+ mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
+ for (size_t i = 0; i < i_ndim; i++) {
+ i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
+ }
+      auto o_desc = mkldnn::memory::desc(i_dims, get_mkldnn_type(out_type), i_fmt);
+ auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
+ auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr);
+ i_mem_ = std::make_shared<mkldnn::memory>(i_mpd, nullptr);
+ o_mem_ = std::make_shared<mkldnn::memory>(o_mpd, nullptr);
+      fwd_pd_ = std::make_shared<mkldnn::reorder>(reorder_pd, *i_mem_, *o_mem_);
+ initalized_ = true;
+ }
+    auto o_mem = CreateMKLDNNMem(outputs[0], o_mem_->get_primitive_desc(), req[0]);
+ i_mem_->set_data_handle(i_mem->get_data_handle());
+ o_mem_->set_data_handle(o_mem.second->get_data_handle());
+ MKLDNNStream::Get()->RegisterPrim(*fwd_pd_);
+ CommitOutput(outputs[0], o_mem);
+ MKLDNNStream::Get()->Submit();
}
}
+static void SgMKLDNNQuantizeForward(const OpStatePtr &state_ptr, const OpContext &ctx,
+                                    const std::vector<NDArray> &inputs,
+                                    const std::vector<OpReqType> &req,
+                                    const std::vector<NDArray> &outputs) {
+  SgMKLDNNQuantizeOperator &op = state_ptr.get_state<SgMKLDNNQuantizeOperator>();
+ op.Forward(ctx, inputs, req, outputs);
+}
+
} // namespace op
} // namespace mxnet
diff --git a/src/operator/quantization/quantize_v2-inl.h b/src/operator/quantization/quantize_v2-inl.h
index 9ebb645..2054075 100644
--- a/src/operator/quantization/quantize_v2-inl.h
+++ b/src/operator/quantization/quantize_v2-inl.h
@@ -125,95 +125,14 @@ struct quantize_v2_zero_centered {
}
};
-template <typename xpu>
-void QuantizeV2Compute(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
-                       const std::vector<TBlob> &inputs, const std::vector<OpReqType> &req,
- const std::vector<TBlob> &outputs) {
- using namespace mshadow;
- using namespace mxnet_op;
- typedef float SrcDType;
- using mshadow::red::limits::MaxValue;
- using mshadow::red::limits::MinValue;
- Stream<xpu> *s = ctx.get_stream<xpu>();
- const QuantizeV2Param ¶m = nnvm::get<QuantizeV2Param>(attrs.parsed);
- auto out_type = GetOutputType(param);
- if (out_type == mshadow::kUint8 && std::is_same<xpu, gpu>::value) {
- LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, "
- "please switch to the context of CPU or int8 data type for
GPU.";
- }
-
-  if (inputs[0].type_flag_ == mshadow::kUint8 || inputs[0].type_flag_ == mshadow::kInt8) {
-    if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
- *outputs[1].dptr<float>() = param.min_calib_range.value();
- *outputs[2].dptr<float>() = param.max_calib_range.value();
- } else {
- if (inputs[0].type_flag_ == mshadow::kUint8) {
- *outputs[1].dptr<float>() = 0;
- *outputs[2].dptr<float>() = 255;
- } else {
- *outputs[1].dptr<float>() = -127;
- *outputs[2].dptr<float>() = 127;
- }
- }
- UnaryOp::IdentityCompute<xpu>(attrs, ctx, {inputs[0]}, req, outputs);
- } else {
-    if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
- if (out_type == mshadow::kUint8) {
- Kernel<quantize_v2_unsigned, xpu>::Launch(
-          s, outputs[0].Size(), outputs[0].dptr<uint8_t>(), outputs[1].dptr<float>(),
-          outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), param.min_calib_range.value(),
-          param.max_calib_range.value(), MinValue<uint8_t>(), MaxValue<uint8_t>());
- } else if (out_type == mshadow::kInt8) { // zero-centered quantization
- Kernel<quantize_v2_zero_centered, xpu>::Launch(
-          s, outputs[0].Size(), outputs[0].dptr<int8_t>(), outputs[1].dptr<float>(),
-          outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), param.min_calib_range.value(),
-          param.max_calib_range.value(), MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
-      } else {
-        LOG(FATAL) << "quantize op only supports int8 and uint8 as output type";
- }
- } else { // model is not calibrated
- mxnet::TShape src_shape, dst_shape;
- const size_t actual_float_size = sizeof(float);
- const size_t temp_reduce_size = ConfigReduce<xpu, SrcDType>(
- s, inputs[0].shape_, mxnet::TShape(1, 1), &src_shape, &dst_shape);
-      Tensor<xpu, 1, char> temp_space = ctx.requested[0].get_space_typed<xpu, 1, char>(
-          Shape1(2 * actual_float_size + temp_reduce_size), s);
-      const int dev_id = ctx.run_ctx.ctx.dev_id;
-      TBlob in_min_t(reinterpret_cast<SrcDType *>(temp_space.dptr_), Shape1(1), xpu::kDevMask,
-                     dev_id);
-      TBlob in_max_t(reinterpret_cast<SrcDType *>(temp_space.dptr_) + 1, Shape1(1), xpu::kDevMask,
-                     dev_id);
- Tensor<xpu, 1, char> workspace(temp_space.dptr_ + 2 * actual_float_size,
- Shape1(temp_reduce_size), s);
- broadcast::Reduce<red::minimum, 2, SrcDType, mshadow::op::identity>(
-          s, in_min_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape));
-      broadcast::Reduce<red::maximum, 2, SrcDType, mshadow::op::identity>(
-          s, in_max_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape));
- if (out_type == mshadow::kUint8) {
- Kernel<quantize_v2_unsigned, xpu>::Launch(
-            s, outputs[0].Size(), outputs[0].dptr<uint8_t>(), outputs[1].dptr<float>(),
-            outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), in_min_t.dptr<float>(),
-            in_max_t.dptr<float>(), MinValue<uint8_t>(), MaxValue<uint8_t>());
- } else if (out_type == mshadow::kInt8) { // zero-centered quantization
- Kernel<quantize_v2_zero_centered, xpu>::Launch(
-            s, outputs[0].Size(), outputs[0].dptr<int8_t>(), outputs[1].dptr<float>(),
-            outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), in_min_t.dptr<float>(),
-            in_max_t.dptr<float>(), MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
-      } else {
-        LOG(FATAL) << "quantize op only supports int8 and uint8 as output type";
- }
- }
- }
-}
-
-static inline bool QuantizeV2Shape(const nnvm::NodeAttrs &attrs, mxnet::ShapeVector *in_attrs,
-                                   mxnet::ShapeVector *out_attrs) {
+static inline bool QuantizeV2Shape(const nnvm::NodeAttrs &attrs, std::vector<TShape> *in_attrs,
+                                   std::vector<TShape> *out_attrs) {
CHECK_EQ(in_attrs->size(), 1U);
CHECK_EQ(out_attrs->size(), 3U);
SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
- SHAPE_ASSIGN_CHECK(*out_attrs, 1, mxnet::TShape{1});
- SHAPE_ASSIGN_CHECK(*out_attrs, 2, mxnet::TShape{1});
+ SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape{1});
+ SHAPE_ASSIGN_CHECK(*out_attrs, 2, TShape{1});
return !shape_is_none(out_attrs->at(0));
}
@@ -237,6 +156,102 @@ static inline bool QuantizeV2Type(const nnvm::NodeAttrs &attrs, std::vector<int>
return (*in_attrs)[0] != -1;
}
+template<typename xpu>
+class QuantizeV2Operator {
+ public:
+ explicit QuantizeV2Operator(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {}
+
+ void Forward(const OpContext &ctx, const std::vector<TBlob> &inputs,
+               const std::vector<OpReqType> &req, const std::vector<TBlob> &outputs) {
+ using namespace mshadow;
+ using namespace mxnet_op;
+ typedef float SrcDType;
+ using mshadow::red::limits::MaxValue;
+ using mshadow::red::limits::MinValue;
+ Stream<xpu> *s = ctx.get_stream<xpu>();
+ const QuantizeV2Param ¶m = nnvm::get<QuantizeV2Param>(attrs_.parsed);
+ auto out_type = GetOutputType(param);
+ if (out_type == mshadow::kUint8 && std::is_same<xpu, gpu>::value) {
+ LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, "
+ "please switch to the context of CPU or int8 data type for
GPU.";
+ }
+
+    if (inputs[0].type_flag_ == mshadow::kUint8 || inputs[0].type_flag_ == mshadow::kInt8) {
+      if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
+ *outputs[1].dptr<float>() = param.min_calib_range.value();
+ *outputs[2].dptr<float>() = param.max_calib_range.value();
+ } else {
+ if (inputs[0].type_flag_ == mshadow::kUint8) {
+ *outputs[1].dptr<float>() = 0;
+ *outputs[2].dptr<float>() = 255;
+ } else {
+ *outputs[1].dptr<float>() = -127;
+ *outputs[2].dptr<float>() = 127;
+ }
+ }
+ UnaryOp::IdentityCompute<xpu>(attrs_, ctx, {inputs[0]}, req, outputs);
+ } else {
+      if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
+ if (out_type == mshadow::kUint8) {
+ Kernel<quantize_v2_unsigned, xpu>::Launch(
+            s, outputs[0].Size(), outputs[0].dptr<uint8_t>(), outputs[1].dptr<float>(),
+            outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), param.min_calib_range.value(),
+            param.max_calib_range.value(), MinValue<uint8_t>(), MaxValue<uint8_t>());
+ } else if (out_type == mshadow::kInt8) { // zero-centered quantization
+ Kernel<quantize_v2_zero_centered, xpu>::Launch(
+            s, outputs[0].Size(), outputs[0].dptr<int8_t>(), outputs[1].dptr<float>(),
+            outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), param.min_calib_range.value(),
+            param.max_calib_range.value(), MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
+        } else {
+          LOG(FATAL) << "quantize op only supports int8 and uint8 as output type";
+ }
+ } else { // model is not calibrated
+ mxnet::TShape src_shape, dst_shape;
+ const size_t actual_float_size = sizeof(float);
+ const size_t temp_reduce_size = ConfigReduce<xpu, SrcDType>(
+ s, inputs[0].shape_, mxnet::TShape(1, 1), &src_shape, &dst_shape);
+        Tensor<xpu, 1, char> temp_space = ctx.requested[0].get_space_typed<xpu, 1, char>(
+            Shape1(2 * actual_float_size + temp_reduce_size), s);
+        const int dev_id = ctx.run_ctx.ctx.dev_id;
+        TBlob in_min_t(reinterpret_cast<SrcDType *>(temp_space.dptr_), Shape1(1), xpu::kDevMask,
+                       dev_id);
+        TBlob in_max_t(reinterpret_cast<SrcDType *>(temp_space.dptr_) + 1, Shape1(1), xpu::kDevMask,
+                       dev_id);
+        Tensor<xpu, 1, char> workspace(temp_space.dptr_ + 2 * actual_float_size,
+ Shape1(temp_reduce_size), s);
+ broadcast::Reduce<red::minimum, 2, SrcDType, mshadow::op::identity>(
+            s, in_min_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape));
+        broadcast::Reduce<red::maximum, 2, SrcDType, mshadow::op::identity>(
+            s, in_max_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape));
+ if (out_type == mshadow::kUint8) {
+ Kernel<quantize_v2_unsigned, xpu>::Launch(
+              s, outputs[0].Size(), outputs[0].dptr<uint8_t>(), outputs[1].dptr<float>(),
+              outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), in_min_t.dptr<float>(),
+              in_max_t.dptr<float>(), MinValue<uint8_t>(), MaxValue<uint8_t>());
+ } else if (out_type == mshadow::kInt8) { // zero-centered quantization
+ Kernel<quantize_v2_zero_centered, xpu>::Launch(
+              s, outputs[0].Size(), outputs[0].dptr<int8_t>(), outputs[1].dptr<float>(),
+              outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), in_min_t.dptr<float>(),
+              in_max_t.dptr<float>(), MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
+          } else {
+            LOG(FATAL) << "quantize op only supports int8 and uint8 as output type";
+ }
+ }
+ }
+ }
+
+ private:
+ nnvm::NodeAttrs attrs_;
+};
+
+template <typename xpu>
+static void QuantizeV2Forward(const OpStatePtr &state_ptr, const OpContext &ctx,
+                              const std::vector<TBlob> &inputs, const std::vector<OpReqType> &req,
+ const std::vector<TBlob> &outputs) {
+ auto &op = state_ptr.get_state<QuantizeV2Operator<xpu>>();
+ op.Forward(ctx, inputs, req, outputs);
+}
+
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZE_V2_INL_H_
diff --git a/src/operator/quantization/quantize_v2.cc b/src/operator/quantization/quantize_v2.cc
index 920100b..e9017a5 100644
--- a/src/operator/quantization/quantize_v2.cc
+++ b/src/operator/quantization/quantize_v2.cc
@@ -47,6 +47,22 @@ static bool QuantizeV2StorageType(const nnvm::NodeAttrs& attrs, const int dev_ma
return true;
}
+static OpStatePtr CreateQuantizeV2State(const nnvm::NodeAttrs& attrs, Context ctx,
+ const std::vector<TShape>& in_shapes,
+ const std::vector<int>& in_types) {
+ OpStatePtr state;
+ if (ctx.dev_type == kGPU) {
+ state = OpStatePtr::Create<QuantizeV2Operator<gpu>>(attrs);
+ } else {
+#if MXNET_USE_MKLDNN == 1
+ state = OpStatePtr::Create<SgMKLDNNQuantizeOperator>(attrs);
+#else
+ state = OpStatePtr::Create<QuantizeV2Operator<cpu>>(attrs);
+#endif
+ }
+ return state;
+}
+
NNVM_REGISTER_OP(_contrib_quantize_v2)
.describe(R"code(Quantize a input tensor from float to `out_type`,
with user-specified `min_calib_range` and `max_calib_range` or the input range
collected at runtime.
@@ -86,11 +102,12 @@ If min_calib_range isn't presented, the output type will be int8.
// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
// will be reverted after the improvement of CachedOP is done.
.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
+.set_attr<FCreateOpState>("FCreateOpState", CreateQuantizeV2State)
#if MXNET_USE_MKLDNN == 1
.set_attr<bool>("TIsMKLDNN", true)
-.set_attr<FComputeEx>("FComputeEx<cpu>", MKLDNNQuantizeV2Compute)
+.set_attr<FStatefulComputeEx>("FStatefulComputeEx<cpu>",
SgMKLDNNQuantizeForward)
#endif
-.set_attr<FCompute>("FCompute<cpu>", QuantizeV2Compute<cpu>)
+.set_attr<FStatefulCompute>("FStatefulCompute<cpu>", QuantizeV2Forward<cpu>)
.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs) {
return std::vector<std::pair<int, int> >{{0, 0}};
})
diff --git a/src/operator/quantization/quantize_v2.cu b/src/operator/quantization/quantize_v2.cu
index ab0cf9c..7acdf56 100644
--- a/src/operator/quantization/quantize_v2.cu
+++ b/src/operator/quantization/quantize_v2.cu
@@ -28,7 +28,7 @@ namespace mxnet {
namespace op {
NNVM_REGISTER_OP(_contrib_quantize_v2)
-.set_attr<FCompute>("FCompute<gpu>", QuantizeV2Compute<gpu>);
+.set_attr<FStatefulCompute>("FStatefulCompute<gpu>", QuantizeV2Forward<gpu>);
} // namespace op
} // namespace mxnet
diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
index d61b461..e142fae 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
@@ -175,7 +175,7 @@ class SgMKLDNNConvOperator {
const std::vector<NDArray> &outputs);
private:
- bool initalized_{false};
+ bool initialized_{false};
bool inplace_{false};
bool post_requantize_{false};
nnvm::Symbol subgraph_sym_;
@@ -235,7 +235,7 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
     // Copy inputs[in_sum] into outputs[kOut] in case inplace optimization failed.
if (mkldnn_param.with_sum) {
- if (!initalized_) {
+ if (!initialized_) {
       // TODO(zhennan): Currently, mkldnn fallback mechanism will break inplace option,
// which make check (req[kOut] == kWriteInplace) useless.
auto in_mkl_mem = inputs[in_sum].GetMKLDNNData();
@@ -257,23 +257,23 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
// Check input change
// TODO(zhennan): Only update cached_* changed.
- if (initalized_) {
+ if (initialized_) {
if (mkldnn_param.with_bn) {
if (weight_ver_ != inputs[in_weight].version() ||
((!conv_param.no_bias) && bias_ver_ != inputs[in_bias].version())) {
- initalized_ = false;
+ initialized_ = false;
}
}
- if (initalized_ && mkldnn_param.quantized) {
+ if (initialized_ && mkldnn_param.quantized) {
if (cached_data_min_ != data_min || cached_data_max_ != data_max ||
cached_sum_min_ != sum_min || cached_sum_max_ != sum_max ||
weight_ver_ != inputs[in_weight].version() ||
((!conv_param.no_bias) && bias_ver_ != inputs[in_bias].version())) {
- initalized_ = false;
+ initialized_ = false;
}
}
}
- if (!initalized_) {
+ if (!initialized_) {
cached_data_min_ = data_min;
cached_data_max_ = data_max;
cached_sum_min_ = sum_min;
@@ -353,7 +353,7 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
fwd_->SetNewMem(*data.GetMKLDNNData(), *cached_weight_.GetMKLDNNData(),
has_bias ? cached_bias_.GetMKLDNNData() : nullptr,
*output.GetMKLDNNData());
- initalized_ = true;
+ initialized_ = true;
}
if (mkldnn_param.quantized) {