This is an automated email from the ASF dual-hosted git repository.
patriczhao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 6c60025 [MKLDNN]Refactor requantize to speed up execution (#14608)
6c60025 is described below
commit 6c600250f2a8516443c7909d156cbb522ad9c395
Author: Zhennan Qin <[email protected]>
AuthorDate: Sun Apr 28 09:19:37 2019 +0800
[MKLDNN]Refactor requantize to speed up execution (#14608)
* Refactor requantize
* fix ci
* Fix CI
* Fix ci
---
.../quantization/mkldnn/mkldnn_quantize_v2-inl.h | 2 +-
.../quantization/mkldnn/mkldnn_quantized_conv.cc | 2 +-
.../mkldnn/mkldnn_quantized_fully_connected.cc | 2 +-
.../quantization/mkldnn/mkldnn_requantize-inl.h | 91 +++++++++++-----------
src/operator/quantization/quantization_utils.h | 78 +++++++++++++------
src/operator/quantization/quantize_graph_pass.cc | 3 +-
src/operator/quantization/quantize_v2-inl.h | 33 ++------
src/operator/quantization/quantized_conv.cu | 2 +-
.../quantization/quantized_fully_connected.cc | 2 +-
.../quantization/quantized_fully_connected.cu | 2 +-
src/operator/quantization/requantize-inl.h | 23 +++++-
src/operator/subgraph/mkldnn/mkldnn_conv.cc | 48 +++++++++---
src/operator/subgraph/mkldnn/mkldnn_fc.cc | 2 +-
tests/python/quantization/test_quantization.py | 28 +++----
14 files changed, 189 insertions(+), 129 deletions(-)
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
index 2da4158..bd1b47e 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
+++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
@@ -108,7 +108,7 @@ void SgMKLDNNQuantizeOperator::Forward(const OpContext &ctx, const std::vector<N
}
// Write output min/max
- auto out_type = GetOutputType(param_);
+ auto out_type = GetQuantizeOutputType(param_);
if (out_type == mshadow::kUint8) {
quantized_range = kUint8Range;
*outputs[1].data().dptr<float>() = data_min;
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc
index b8c47c3..55028d8 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc
@@ -72,7 +72,7 @@ static void MKLDNNQuantizedConvForward(const nnvm::NodeAttrs& attrs,
MKLDNNStream::Get()->Submit();
Stream<cpu> *s = ctx.get_stream<cpu>();
const size_t num_inputs = param.no_bias ? 2 : 3;
- mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
+ mxnet_op::Kernel<QuantizationRangeForS8S8MultiplicationStruct, cpu>::Launch(s, 1,
out_data[1].data().dptr<float>(), out_data[2].data().dptr<float>(),
in_data[num_inputs].data().dptr<float>(),
in_data[num_inputs+1].data().dptr<float>(),
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
index cf3d789..e8abab2 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
@@ -80,7 +80,7 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
}
Stream<cpu> *s = ctx.get_stream<cpu>();
- mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
+ mxnet_op::Kernel<QuantizationRangeForS8S8MultiplicationStruct, cpu>::Launch(s, 1,
min_output_ptr, max_output_ptr, &min_data, &max_data, &min_weight, &max_weight);
bool is_train = false;
diff --git a/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h
index ac414c7..03d9b90 100644
--- a/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h
+++ b/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h
@@ -34,6 +34,7 @@
namespace mxnet {
namespace op {
+template <typename DstType>
static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
@@ -45,7 +46,6 @@ static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs,
using red::limits::MaxValue;
using red::limits::MinValue;
typedef int32_t SrcDType;
- typedef int8_t DstDType;
// check shapes
size_t i_dim = inputs[0].shape().ndim();
size_t o_dim = outputs[0].shape().ndim();
@@ -56,12 +56,21 @@ static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs,
*inputs[2].data().dptr<float>());
float first_scale = first_real_range / first_quantized_range;
float second_real_range = real_range;
- float second_quantized_range = MinAbs(MaxValue<DstDType>(),
- MinValue<DstDType>());
+ float second_quantized_range = 0.f;
+ if (std::is_same<DstType, int8_t>::value) {
+ second_quantized_range = MinAbs(MaxValue<DstType>(), MinValue<DstType>());
+ *outputs[1].data().dptr<float>() = -second_real_range;
+ *outputs[2].data().dptr<float>() = second_real_range;
+ } else if (std::is_same<DstType, uint8_t>::value) {
+ second_quantized_range = MaxValue<DstType>();
+ *outputs[1].data().dptr<float>() = 0.f;
+ *outputs[2].data().dptr<float>() = second_real_range;
+ } else {
+ LOG(FATAL) << "Unsupported requantize output type";
+ }
float second_scale = second_quantized_range / second_real_range;
float scale = first_scale * second_scale;
- *outputs[1].data().dptr<float>() = -second_real_range;
- *outputs[2].data().dptr<float>() = second_real_range;
+
primitive_attr attr;
const int mask = 0;
std::vector<float> scales = {scale};
@@ -82,7 +91,7 @@ static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs,
i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
}
auto o_desc = mkldnn::memory::desc(i_dims,
- (mkldnn::memory::data_type)data_type_enum<DstDType>::type,
+ (mkldnn::memory::data_type)data_type_enum<DstType>::type,
i_fmt);
auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr);
@@ -99,55 +108,47 @@ static void MKLDNNRequantizeForward(const nnvm::NodeAttrs& attrs,
const std::vector<NDArray>& outputs) {
using namespace mshadow;
using namespace mxnet_op;
+ using red::limits::MaxValue;
+ using red::limits::MinValue;
typedef int32_t SrcDType;
typedef int8_t DstDType;
- Stream<cpu> *s = ctx.get_stream<cpu>();
const RequantizeParam& param = nnvm::get<RequantizeParam>(attrs.parsed);
float real_range;
// Model is calibrated
if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
real_range =
MaxAbs(param.min_calib_range.value(), param.max_calib_range.value());
- MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range);
// Model is not calibrated
} else {
- mxnet::TShape src_shape, dst_shape;
- const size_t actual_float_size = sizeof(float);
- const size_t actual_quantized_size = sizeof(SrcDType);
- const size_t temp_reduce_size = ConfigReduce<cpu, SrcDType>(s,
- inputs[0].shape(), mxnet::TShape(1, 1), &src_shape, &dst_shape);
- Tensor<cpu, 1, char> temp_space =
- ctx.requested[0].get_space_typed<cpu, 1, char>(
- Shape1(2*actual_float_size+2*actual_quantized_size+temp_reduce_size), s);
- Tensor<cpu, 1, float> actual_min_float(
- reinterpret_cast<float*>(temp_space.dptr_), Shape1(1), s);
- Tensor<cpu, 1, float> actual_max_float(
- reinterpret_cast<float*>(temp_space.dptr_) + 1, Shape1(1), s);
- const int dev_id = ctx.run_ctx.ctx.dev_id;
- TBlob actual_min_quantized(reinterpret_cast<SrcDType*>(
- temp_space.dptr_ + 8), Shape1(1), cpu::kDevMask, dev_id);
- TBlob actual_max_quantized(reinterpret_cast<SrcDType*>(
- temp_space.dptr_ + 8) + 1, Shape1(1), cpu::kDevMask, dev_id);
- Tensor<cpu, 1, char> workspace(
- temp_space.dptr_+2*actual_float_size+2*actual_quantized_size,
- Shape1(temp_reduce_size), s);
- broadcast::Reduce<red::minimum, 2, SrcDType, mshadow::op::identity>(
- s, actual_min_quantized.reshape(dst_shape), kWriteTo,
- workspace, inputs[0].Reorder2Default().data().reshape(src_shape));
- Kernel<QuantizedToFloatStruct, cpu>::Launch(s, 1,
- actual_min_float.dptr_, actual_min_quantized.dptr<SrcDType>(),
- inputs[1].Reorder2Default().data().dptr<float>(),
- inputs[2].Reorder2Default().data().dptr<float>());
- broadcast::Reduce<red::maximum, 2, SrcDType, mshadow::op::identity>(
- s, actual_max_quantized.reshape(dst_shape), kWriteTo,
- workspace, inputs[0].Reorder2Default().data().reshape(src_shape));
- Kernel<QuantizedToFloatStruct, cpu>::Launch(s, 1,
- actual_max_float.dptr_, actual_max_quantized.dptr<SrcDType>(),
- inputs[1].Reorder2Default().data().dptr<float>(),
- inputs[2].Reorder2Default().data().dptr<float>());
-
- real_range = MaxAbs(*actual_min_float.dptr_, *actual_max_float.dptr_);
- MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range);
+ NDArray in_buffer = inputs[0].Reorder2Default();
+ auto in_ptr = in_buffer.data().dptr<SrcDType>();
+ auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+ SrcDType data_min = MaxValue<SrcDType>();
+ SrcDType data_max = MinValue<SrcDType>();
+ std::vector<SrcDType> data_maxs(nthreads, data_max);
+ std::vector<SrcDType> data_mins(nthreads, data_min);
+#pragma omp parallel for num_threads(nthreads)
+ for (index_t i = 0; i < static_cast<index_t>(in_buffer.shape().Size()); i++) {
+ int tid = omp_get_thread_num();
+ if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i];
+ if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i];
+ }
+ for (index_t i = 0; i < nthreads; i++) {
+ if (data_maxs[i] > data_max) data_max = data_maxs[i];
+ if (data_mins[i] < data_min) data_min = data_mins[i];
+ }
+ float src_range = MinAbs(MinValue<SrcDType>(), MaxValue<SrcDType>());
+ SrcDType data_range = MaxAbs(data_min, data_max);
+ float data_scale = MaxAbs(*inputs[1].data().dptr<float>(), *inputs[2].data().dptr<float>());
+ real_range = data_range * data_scale / src_range;
+ }
+ auto out_type = GetQuantizeOutputType(param);
+ if (out_type == mshadow::kUint8) {
+ MKLDNNRequantizeForwardKer<uint8_t>(attrs, ctx, inputs, req, outputs, real_range);
+ } else if (out_type == mshadow::kInt8) {
+ MKLDNNRequantizeForwardKer<int8_t>(attrs, ctx, inputs, req, outputs, real_range);
+ } else {
+ LOG(FATAL) << "mkldnn requantize op only supports int8 and uint8 as output type";
}
}
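
For reference, a minimal NumPy sketch of the requantize math used above, assuming the
int32 input maps linearly onto the signed output range (illustrative only, not part of
the patch; function names here are made up):

    import numpy as np

    def requantize_int32_to_int8(data, min_range, max_range, real_range):
        # first scale: int32 quantized domain -> float, from the recorded float range
        int32_range = float(np.iinfo(np.int32).max)      # 2147483647
        first_scale = max(abs(min_range), abs(max_range)) / int32_range
        # second scale: float -> int8 quantized domain, from the target real_range
        second_scale = float(np.iinfo(np.int8).max) / real_range
        scale = first_scale * second_scale
        # symmetric clip; the MKLDNN reorder performs the equivalent saturation
        out = np.clip(np.rint(data * scale), -127, 127).astype(np.int8)
        return out, -real_range, real_range

    def uncalibrated_real_range(data, min_range, max_range):
        # new uncalibrated path: derive real_range from the observed int32 extrema
        int32_range = float(np.iinfo(np.int32).max)
        data_range = max(abs(int(data.min())), abs(int(data.max())))
        data_scale = max(abs(min_range), abs(max_range))
        return data_range * data_scale / int32_range
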
diff --git a/src/operator/quantization/quantization_utils.h b/src/operator/quantization/quantization_utils.h
index c540ea4..86018e6 100644
--- a/src/operator/quantization/quantization_utils.h
+++ b/src/operator/quantization/quantization_utils.h
@@ -127,39 +127,31 @@ MSHADOW_XINLINE void RequantizeManyInNewRange(size_t count, T2* output, const T1
* \brief Get the scaling factor for converting type T to float.
*/
template<typename T>
-MSHADOW_XINLINE float FloatForOneQuantizedLevel(float range_min, float range_max) {
+MSHADOW_XINLINE float FloatForOneQuantizedLevel(float range_min, float range_max, bool all_sign) {
using mshadow::red::limits::MinValue;
using mshadow::red::limits::MaxValue;
- const int64_t highest = static_cast<int64_t>(MaxValue<T>());
- const int64_t lowest = static_cast<int64_t>(MinValue<T>());
- const float float_for_one_quantized_level =
- (range_max - range_min) / (highest - lowest);
- return float_for_one_quantized_level;
+ float range_data = MaxAbs(range_min, range_max);
+ float range_T = all_sign ? MinAbs(MinValue<T>(), MaxValue<T>()) : MaxValue<T>();
+ return range_data / range_T;
}
template <typename TA, typename TB, typename TC>
-MSHADOW_XINLINE void QuantizationRangeForMultiplication(float min_a, float max_a,
- float min_b, float max_b,
- float* min_c, float* max_c) {
- using mshadow::red::limits::MinValue;
+MSHADOW_XINLINE void QuantizationRangeForMultiplication(float min_a, float max_a, float min_b,
+ float max_b, float *min_c, float *max_c,
+ bool all_sign) {
using mshadow::red::limits::MaxValue;
- const float a_float_for_one_quant_level =
- FloatForOneQuantizedLevel<TA>(min_a, max_a);
- const float b_float_for_one_quant_level =
- FloatForOneQuantizedLevel<TB>(min_b, max_b);
-
- const int64_t c_highest =
- static_cast<int64_t>(MaxValue<TC>());
- const int64_t c_lowest =
- static_cast<int64_t>(MinValue<TC>());
+ using mshadow::red::limits::MinValue;
+ const float a_float_for_one_quant_level = FloatForOneQuantizedLevel<TA>(min_a, max_a, all_sign);
+ const float b_float_for_one_quant_level = FloatForOneQuantizedLevel<TB>(min_b, max_b, all_sign);
+ const float range_c =
+ MinAbs(static_cast<int64_t>(MinValue<TC>()), static_cast<int64_t>(MaxValue<TC>()));
const float c_float_for_one_quant_level =
- a_float_for_one_quant_level * b_float_for_one_quant_level;
-
- *min_c = c_float_for_one_quant_level * c_lowest;
- *max_c = c_float_for_one_quant_level * c_highest;
+ a_float_for_one_quant_level * b_float_for_one_quant_level;
+ *max_c = c_float_for_one_quant_level * range_c;
+ *min_c = -*max_c;
}
-struct QuantizationRangeForMultiplicationStruct {
+struct QuantizationRangeForS8S8MultiplicationStruct {
MSHADOW_XINLINE static void Map(int i,
float *min_c,
float *max_c,
@@ -168,7 +160,20 @@ struct QuantizationRangeForMultiplicationStruct {
const float *min_b,
const float *max_b) {
QuantizationRangeForMultiplication<int8_t, int8_t, int32_t>(
- min_a[i], max_a[i], min_b[i], max_b[i], min_c, max_c);
+ min_a[i], max_a[i], min_b[i], max_b[i], min_c, max_c, true);
+ }
+};
+
+struct QuantizationRangeForS8U8MultiplicationStruct {
+ MSHADOW_XINLINE static void Map(int i,
+ float *min_c,
+ float *max_c,
+ const float *min_a,
+ const float *max_a,
+ const float *min_b,
+ const float *max_b) {
+ QuantizationRangeForMultiplication<int8_t, uint8_t, int32_t>(
+ min_a[i], max_a[i], min_b[i], max_b[i], min_c, max_c, false);
}
};
@@ -186,6 +191,29 @@ inline size_t ConfigReduce(mshadow::Stream<xpu>* s,
return broadcast::ReduceWorkspaceSize<NDim, DType>(s, *dst_shape, kWriteTo, *src_shape);
}
+enum QuantizeOutType { kAuto = 0, kInt8, kUint8 };
+
+template<typename Param>
+static mshadow::TypeFlag GetQuantizeOutputType(const Param &param) {
+ auto out_type = mshadow::kInt8;
+ if (param.out_type == QuantizeOutType::kAuto) {
+ if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
+ if (param.min_calib_range.value() >= 0.0) {
+ out_type = mshadow::kUint8;
+ } else {
+ out_type = mshadow::kInt8;
+ }
+ }
+ } else if (param.out_type == QuantizeOutType::kInt8) {
+ out_type = mshadow::kInt8;
+ } else if (param.out_type == QuantizeOutType::kUint8) {
+ out_type = mshadow::kUint8;
+ } else {
+ LOG(FATAL) << "Unsupported out_type in params: " <<param.out_type;
+ }
+ return out_type;
+}
+
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZATION_UTILS_H_
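
A minimal NumPy sketch of the symmetric range arithmetic now used by
FloatForOneQuantizedLevel / QuantizationRangeForMultiplication above (my reading of the
patch; illustrative only, not part of it):

    import numpy as np

    def float_for_one_quantized_level(range_min, range_max, all_sign, dtype):
        # float value represented by one quantized level of `dtype`
        info = np.iinfo(dtype)
        range_t = min(abs(info.min), abs(info.max)) if all_sign else info.max
        return max(abs(range_min), abs(range_max)) / float(range_t)

    def quantization_range_for_multiplication(min_a, max_a, min_b, max_b, all_sign,
                                              a_dtype=np.int8, b_dtype=np.int8,
                                              c_dtype=np.int32):
        a_level = float_for_one_quantized_level(min_a, max_a, all_sign, a_dtype)
        b_level = float_for_one_quantized_level(min_b, max_b, all_sign, b_dtype)
        range_c = min(abs(np.iinfo(c_dtype).min), abs(np.iinfo(c_dtype).max))
        max_c = a_level * b_level * range_c   # output range stays symmetric
        return -max_c, max_c
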
diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc
index 7ff2999..7591477 100644
--- a/src/operator/quantization/quantize_graph_pass.cc
+++ b/src/operator/quantization/quantize_graph_pass.cc
@@ -248,6 +248,7 @@ Graph QuantizeGraph(Graph &&src) {
NodePtr requantize_node = Node::Create();
requantize_node->attrs.op = Op::Get("_contrib_requantize");
requantize_node->attrs.name = "requantize_" + node->attrs.name;
+ requantize_node->attrs.dict["out_type"] = quantized_dtype;
if (requantize_node->op()->attr_parser != nullptr) {
requantize_node->op()->attr_parser(&(requantize_node->attrs));
}
@@ -398,7 +399,7 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) {
node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second);
node->op()->attr_parser(&(node->attrs));
const QuantizeV2Param& param = nnvm::get<QuantizeV2Param>(node->attrs.parsed);
- if (param.out_type == QuantizeV2Param::OutType::kUint8 &&
+ if (param.out_type == QuantizeOutType::kUint8 &&
param.min_calib_range.value() < 0.0f) {
LOG(WARNING) << "Calibration statistics indicates that node `" << node->attrs.name
<< "` has negative input, consider use `auto` or `int8` as out_type";
diff --git a/src/operator/quantization/quantize_v2-inl.h b/src/operator/quantization/quantize_v2-inl.h
index 2054075..a8cbc0b 100644
--- a/src/operator/quantization/quantize_v2-inl.h
+++ b/src/operator/quantization/quantize_v2-inl.h
@@ -38,16 +38,15 @@ namespace mxnet {
namespace op {
struct QuantizeV2Param : public dmlc::Parameter<QuantizeV2Param> {
- enum OutType { kAuto = 0, kInt8, kUint8 };
int out_type;
dmlc::optional<float> min_calib_range;
dmlc::optional<float> max_calib_range;
DMLC_DECLARE_PARAMETER(QuantizeV2Param) {
DMLC_DECLARE_FIELD(out_type)
- .add_enum("auto", kAuto)
- .add_enum("int8", kInt8)
- .add_enum("uint8", kUint8)
- .set_default(kInt8)
+ .add_enum("auto", QuantizeOutType::kAuto)
+ .add_enum("int8", QuantizeOutType::kInt8)
+ .add_enum("uint8", QuantizeOutType::kUint8)
+ .set_default(QuantizeOutType::kInt8)
.describe("Output data type. `auto` can be specified to automatically
determine output type "
"according to min_calib_range.");
DMLC_DECLARE_FIELD(min_calib_range)
@@ -61,26 +60,6 @@ struct QuantizeV2Param : public dmlc::Parameter<QuantizeV2Param> {
}
};
-static mshadow::TypeFlag GetOutputType(const QuantizeV2Param &param) {
- auto out_type = mshadow::kInt8;
- if (param.out_type == QuantizeV2Param::OutType::kAuto) {
- if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
- if (param.min_calib_range.value() >= 0.0) {
- out_type = mshadow::kUint8;
- } else {
- out_type = mshadow::kInt8;
- }
- }
- } else if (param.out_type == QuantizeV2Param::OutType::kInt8) {
- out_type = mshadow::kInt8;
- } else if (param.out_type == QuantizeV2Param::OutType::kUint8) {
- out_type = mshadow::kUint8;
- } else {
- LOG(FATAL) << "Unsupported out_type in params: " <<param.out_type;
- }
- return out_type;
-}
-
// quantize float to uint8_t
struct quantize_v2_unsigned {
template <typename DstDType, typename SrcDType>
@@ -143,7 +122,7 @@ static inline bool QuantizeV2Type(const nnvm::NodeAttrs &attrs, std::vector<int>
const QuantizeV2Param &param = nnvm::get<QuantizeV2Param>(attrs.parsed);
CHECK(in_attrs->at(0) == mshadow::kFloat32 || in_attrs->at(0) == mshadow::kUint8 ||
in_attrs->at(0) == mshadow::kInt8);
- auto out_type = GetOutputType(param);
+ auto out_type = GetQuantizeOutputType(param);
if (out_type == mshadow::kUint8) {
TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8);
} else if (out_type == mshadow::kInt8) {
@@ -170,7 +149,7 @@ class QuantizeV2Operator {
using mshadow::red::limits::MinValue;
Stream<xpu> *s = ctx.get_stream<xpu>();
const QuantizeV2Param &param = nnvm::get<QuantizeV2Param>(attrs_.parsed);
- auto out_type = GetOutputType(param);
+ auto out_type = GetQuantizeOutputType(param);
if (out_type == mshadow::kUint8 && std::is_same<xpu, gpu>::value) {
LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, "
"please switch to the context of CPU or int8 data type for
GPU.";
diff --git a/src/operator/quantization/quantized_conv.cu b/src/operator/quantization/quantized_conv.cu
index ee688c0..23c41a1 100644
--- a/src/operator/quantization/quantized_conv.cu
+++ b/src/operator/quantization/quantized_conv.cu
@@ -174,7 +174,7 @@ class QuantizedCuDNNConvOp {
// of in_data[0] and in_data[1]. Need to rescale the min/max range of out_data
// based on the min/max ranges of in_data[0] and in_data[1].
const size_t num_inputs = param_.no_bias ? 2 : 3;
- mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, gpu>::Launch(s, 1,
+ mxnet_op::Kernel<QuantizationRangeForS8S8MultiplicationStruct, gpu>::Launch(s, 1,
out_data[1].dptr<float>(), out_data[2].dptr<float>(),
in_data[num_inputs].dptr<float>(), in_data[num_inputs+1].dptr<float>(),
in_data[num_inputs+2].dptr<float>(),
in_data[num_inputs+3].dptr<float>());
diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index e42ea30..ceac0b6 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -233,7 +233,7 @@ void QuantizedFullyConnectedForwardCPU(const nnvm::NodeAttrs& attrs,
Tensor<cpu, 1, float> max_weight =
in_data[num_inputs + quantized_fullc::kWeightMax].get<cpu, 1, float>(s);
- Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1, min_output.dptr_,
+ Kernel<QuantizationRangeForS8S8MultiplicationStruct, cpu>::Launch(s, 1, min_output.dptr_,
max_output.dptr_, min_data.dptr_, max_data.dptr_, min_weight.dptr_, max_weight.dptr_);
if (!param.no_bias) {
Tensor<cpu, 1, int8_t> bias = in_data[fullc::kBias].get_with_shape<cpu, 1, int8_t>(
diff --git a/src/operator/quantization/quantized_fully_connected.cu b/src/operator/quantization/quantized_fully_connected.cu
index d1cbdc9..04680c8 100644
--- a/src/operator/quantization/quantized_fully_connected.cu
+++ b/src/operator/quantization/quantized_fully_connected.cu
@@ -109,7 +109,7 @@ void QuantizedFullyConnectedForwardGPU(const nnvm::NodeAttrs& attrs,
cmp_type,
CUBLAS_GEMM_DFALT));
- Kernel<QuantizationRangeForMultiplicationStruct, gpu>::Launch(s, 1,
+ Kernel<QuantizationRangeForS8S8MultiplicationStruct, gpu>::Launch(s, 1,
outputs[1].dptr<float>(), outputs[2].dptr<float>(),
inputs[num_inputs].dptr<float>(), inputs[num_inputs+1].dptr<float>(),
inputs[num_inputs+2].dptr<float>(), inputs[num_inputs+3].dptr<float>());
diff --git a/src/operator/quantization/requantize-inl.h b/src/operator/quantization/requantize-inl.h
index 9106c7f..2bdc3a7 100644
--- a/src/operator/quantization/requantize-inl.h
+++ b/src/operator/quantization/requantize-inl.h
@@ -38,9 +38,17 @@ namespace mxnet {
namespace op {
struct RequantizeParam : public dmlc::Parameter<RequantizeParam> {
+ int out_type;
dmlc::optional<float> min_calib_range; // min float value calculated from calibration dataset
dmlc::optional<float> max_calib_range; // max float value calculated from calibration dataset
DMLC_DECLARE_PARAMETER(RequantizeParam) {
+ DMLC_DECLARE_FIELD(out_type)
+ .add_enum("auto", QuantizeOutType::kAuto)
+ .add_enum("int8", QuantizeOutType::kInt8)
+ .add_enum("uint8", QuantizeOutType::kUint8)
+ .set_default(QuantizeOutType::kInt8)
+ .describe("Output data type. `auto` can be specified to automatically
determine output type "
+ "according to min_calib_range.");
DMLC_DECLARE_FIELD(min_calib_range)
.set_default(dmlc::optional<float>())
.describe("The minimum scalar value in the form of float32 obtained "
@@ -59,10 +67,18 @@ inline bool RequantizeType(const nnvm::NodeAttrs& attrs,
std::vector<int> *out_attrs) {
CHECK_EQ(in_attrs->size(), 3U);
CHECK_EQ(out_attrs->size(), 3U);
+ const RequantizeParam &param = nnvm::get<RequantizeParam>(attrs.parsed);
TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt32);
TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32);
- TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+ auto out_type = GetQuantizeOutputType(param);
+ if (out_type == mshadow::kUint8) {
+ TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8);
+ } else if (out_type == mshadow::kInt8) {
+ TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+ } else {
+ LOG(FATAL) << "requantize op only supports int8 and uint8 as output type";
+ }
TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32);
TYPE_ASSIGN_CHECK(*out_attrs, 2, mshadow::kFloat32);
return (*in_attrs)[0] != -1;
@@ -100,6 +116,11 @@ void RequantizeForward(const nnvm::NodeAttrs& attrs,
Stream<xpu> *s = ctx.get_stream<xpu>();
const RequantizeParam& param =
nnvm::get<RequantizeParam>(attrs.parsed);
+ auto out_type = GetQuantizeOutputType(param);
+ if (out_type == mshadow::kUint8 && std::is_same<xpu, gpu>::value) {
+ LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, "
+ "please switch to the context of CPU or int8 data type for
GPU.";
+ }
if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
Kernel<RequantizeKernel, xpu>::Launch(s, inputs[0].Size(),
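
With the new out_type field on RequantizeParam, requantize can emit uint8 as well as
int8. A hedged usage sketch, assuming the parameter is exposed to the Python frontend
the same way min_calib_range/max_calib_range already are (illustrative only):

    import mxnet as mx

    qdata = mx.nd.random.uniform(low=0, high=1000.0, shape=(4, 4)).astype('int32')
    min_range = mx.nd.array([0.0])
    max_range = mx.nd.array([1010.0])

    # out_type='auto' would also pick uint8 here because min_calib_range >= 0
    out, out_min, out_max = mx.nd.contrib.requantize(qdata, min_range, max_range,
                                                     out_type='uint8',
                                                     min_calib_range=0.0,
                                                     max_calib_range=500.0)
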
diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
index e142fae..2c05fda 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
@@ -31,6 +31,9 @@
namespace mxnet {
namespace op {
+using red::limits::MaxValue;
+using red::limits::MinValue;
+
template <typename DType>
static void UpdateConvWeightBias(NDArray *weight, NDArray *bias, bool no_bias,
const NDArray &gamma, const NDArray &beta,
@@ -78,8 +81,6 @@ static inline size_t GetInSumIndex(const MKLDNNConvFusionParam &param) {
template <typename DType>
static std::vector<float> GetWeightScales(const NDArray &weight, bool weight_channelwise_scale) {
- using red::limits::MaxValue;
- using red::limits::MinValue;
std::vector<float> weight_scales;
const DType *weight_ptr = weight.data().dptr<DType>();
size_t channel = weight.shape()[0];
@@ -111,9 +112,11 @@ static std::vector<float> GetWeightScales(const NDArray &weight, bool weight_cha
if (total_min > weight_c_min[c]) total_min = weight_c_min[c];
if (total_max < weight_c_max[c]) total_max = weight_c_max[c];
}
- weight_scales.resize(1);
+ weight_scales.resize(3);
DType weight_range = MaxAbs(total_min, total_max);
weight_scales[0] = kInt8Range / weight_range;
+ weight_scales[1] = total_min;
+ weight_scales[2] = total_max;
}
return weight_scales;
}
@@ -247,11 +250,24 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
if (!inplace_) {
auto in_mkl_mem = inputs[in_sum].GetMKLDNNData();
auto out_mkl_mem = outputs[kOut].GetMKLDNNData();
+ if (outputs[kOut].dtype() == mshadow::kInt32) {
+ auto mem_desc = in_mkl_mem->get_primitive_desc().desc();
+ auto this_dtype = get_mkldnn_type(mshadow::kInt32);
+ mkldnn::memory::desc omd(
+ mkldnn::memory::dims(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims),
+ this_dtype, static_cast<mkldnn::memory::format>(mem_desc.data.format));
+ mkldnn::memory::primitive_desc opd(omd, CpuEngine::Get()->get_engine());
+ mkldnn_mem_ptr tmp_mem(new mkldnn::memory(opd, out_mkl_mem->get_data_handle()));
+ MKLDNNStream::Get()->RegisterMem(tmp_mem);
+ MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*in_mkl_mem, *tmp_mem));
+ output = NDArray(tmp_mem);
+ } else {
mkldnn_mem_ptr tmp_mem(
new mkldnn::memory(in_mkl_mem->get_primitive_desc(),
out_mkl_mem->get_data_handle()));
MKLDNNStream::Get()->RegisterMem(tmp_mem);
mxnet::MKLDNNCopy(*in_mkl_mem, tmp_mem.get());
output = NDArray(tmp_mem);
+ }
}
}
@@ -327,7 +343,8 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
float quantized_out_range;
float output_scale;
if (mkldnn_param.with_sum) {
- auto quantized_sum_range = cached_sum_min_ < 0 ? kInt8Range : kUint8Range;
+ auto quantized_sum_range =
+ (inputs[in_sum].dtype() == mshadow::kInt8) ? kInt8Range : kUint8Range;
sum_in_scale = quantized_sum_range / MaxAbs(cached_sum_min_, cached_sum_max_);
}
if (post_requantize_) {
@@ -339,11 +356,23 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
full_conv_param.requantize_scales[c] = output_scale / data_scale_ / weight_scales_[c];
}
} else {
+ Stream<cpu> *s = ctx.get_stream<cpu>();
+ if (data.dtype() == mshadow::kInt8) {
+ mxnet_op::Kernel<QuantizationRangeForS8S8MultiplicationStruct, cpu>::Launch(
+ s, 1, &cached_output_min_, &cached_output_max_, &weight_scales_[1],
+ &weight_scales_[2], &cached_data_min_, &cached_data_max_);
+ } else {
+ mxnet_op::Kernel<QuantizationRangeForS8U8MultiplicationStruct, cpu>::Launch(
+ s, 1, &cached_output_min_, &cached_output_max_, &weight_scales_[1],
+ &weight_scales_[2], &cached_data_min_, &cached_data_max_);
+ }
+ weight_scales_.resize(1);
output_scale = data_scale_ * weight_scales_[0];
full_conv_param.requantize_scales.resize(0);
}
- if (mkldnn_param.with_sum)
+ if (mkldnn_param.with_sum) {
full_conv_param.sum_scale = output_scale / sum_in_scale;
+ }
}
fwd_.reset(new MKLDNNConvForward(
full_conv_param, ctx.is_train, data, cached_weight_,
@@ -375,11 +404,10 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx,
MKLDNNConvolutionForwardFullFeature(full_conv_param, ctx, fwd_.get(), new_inputs, new_req,
{output});
}
- if (post_requantize_) {
- float *out_min_ptr = outputs[kMin].data().dptr<float>();
- float *out_max_ptr = outputs[kMax].data().dptr<float>();
- *out_min_ptr = cached_output_min_;
- *out_max_ptr = cached_output_max_;
+
+ if (mkldnn_param.quantized) {
+ *outputs[kMin].data().dptr<float>() = cached_output_min_;
+ *outputs[kMax].data().dptr<float>() = cached_output_max_;
}
if (mkldnn_param.with_sum) {
auto out = const_cast<NDArray &>(outputs[kOut]);
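
The fused-sum scale in mkldnn_conv.cc now keys off the sum input's dtype rather than
the sign of cached_sum_min_. A small sketch of that selection (illustrative only; the
127/255 constants are assumed to match kInt8Range/kUint8Range):

    def sum_in_scale(sum_dtype, sum_min, sum_max,
                     k_int8_range=127.0, k_uint8_range=255.0):
        # pick the quantized range from the dtype of the summed input
        quantized_sum_range = k_int8_range if sum_dtype == 'int8' else k_uint8_range
        return quantized_sum_range / max(abs(sum_min), abs(sum_max))

    # full_conv_param.sum_scale = output_scale / sum_in_scale(...)
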
diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc.cc b/src/operator/subgraph/mkldnn/mkldnn_fc.cc
index 857a27d..f345a18 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_fc.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_fc.cc
@@ -174,7 +174,7 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
MaxAbs(cached_min_output_, cached_max_output_) / data_scale / weight_scale;
} else {
Stream<cpu> *s = ctx.get_stream<cpu>();
- mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(
+ mxnet_op::Kernel<QuantizationRangeForS8S8MultiplicationStruct, cpu>::Launch(
s, 1, &cached_min_output_, &cached_max_output_,
&min_data, &max_data, &min_weight, &max_weight);
}
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 3c8cc42..a65a9e7 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -85,9 +85,9 @@ def test_dequantize_int8_to_float32():
sym_data = mx.sym.Variable('data')
sym_min_range = mx.sym.Variable('min_range')
sym_max_range = mx.sym.Variable('max_range')
- dequant = mx.sym.contrib.dequantize(sym_data, sym_min_range,
+ dequant = mx.sym.contrib.dequantize(sym_data, sym_min_range,
sym_max_range, out_type='float32')
- out = dequant.bind(ctx=mx.current_context(),
+ out = dequant.bind(ctx=mx.current_context(),
args={'data':qdata, 'min_range':min_range,
'max_range':max_range})
data = out.forward()[0]
assert data.dtype == np.float32
@@ -141,7 +141,8 @@ def test_requantize_int32_to_int8():
qdata_int8, min_output, max_output = mx.nd.contrib.requantize(qdata, min_range, max_range)
else:
qdata_int8, min_output, max_output = mx.nd.contrib.requantize(qdata, min_range, max_range,
- min_calib_range, max_calib_range)
+ min_calib_range=min_calib_range,
+ max_calib_range=max_calib_range)
qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.asscalar(),
max_range.asscalar(),
@@ -150,7 +151,7 @@ def test_requantize_int32_to_int8():
assert_almost_equal(qdata_int8.asnumpy(), qdata_int8_np, atol = 1)
assert_almost_equal(min_output.asnumpy(), np.array([min_output_np]))
assert_almost_equal(max_output.asnumpy(), np.array([max_output_np]))
-
+
def check_requantize_with_symbol(shape, min_calib_range=None, max_calib_range=None):
qdata = mx.nd.random.uniform(low=-1000.0, high=1000.0, shape=shape).astype('int32')
min_range = mx.nd.array([-1010.0])
@@ -160,17 +161,18 @@ def test_requantize_int32_to_int8():
sym_max_range = mx.sym.Variable('max_range')
if min_calib_range is None or max_calib_range is None:
requant = mx.sym.contrib.requantize(sym_data, sym_min_range, sym_max_range)
- out = requant.bind(ctx=mx.current_context(),
- args={'data':qdata, 'min_range':min_range,
- 'max_range':max_range})
+ out = requant.bind(ctx=mx.current_context(),
+ args={'data':qdata, 'min_range':min_range,
+ 'max_range':max_range})
qdata_int8, min_output, max_output = out.forward()
else:
- requant = mx.sym.contrib.requantize(sym_data, sym_min_range, sym_max_range,
- min_calib_range, max_calib_range)
- out = requant.bind(ctx=mx.current_context(), args={'data':qdata, 'min_range':min_range,
- 'max_range':max_range})
- qdata_int8, min_output, max_output = out.forward()
-
+ requant = mx.sym.contrib.requantize(sym_data, sym_min_range, sym_max_range,
+ min_calib_range=min_calib_range,
+ max_calib_range=max_calib_range)
+ out = requant.bind(ctx=mx.current_context(), args={'data':qdata, 'min_range':min_range,
+ 'max_range':max_range})
+ qdata_int8, min_output, max_output = out.forward()
+
qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.asscalar(),
max_range.asscalar(),
min_calib_range=min_calib_range,