This is an automated email from the ASF dual-hosted git repository.
patriczhao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 5d2a451 Performance improvement for MKL-DNN Quantized FullyConnected (#14528)
5d2a451 is described below
commit 5d2a4510c2c226c6921a8a213d04461f68ca7173
Author: ciyong <[email protected]>
AuthorDate: Wed Mar 27 20:03:49 2019 +0800
Performance improvement for MKL-DNN Quantized FullyConnected (#14528)
* Cached bias to Quantized FullyConnected based on Subgraph to improve performance (sketched below)
* retrigger CI
* retrigger CI
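For readers skimming the patch, the change to SgMKLDNNFCOp is: the int8 bias is rescaled to int32 once and kept as member state (cached_bias_, cached_min_bias_, cached_max_bias_), and the output range (cached_min_output_, cached_max_output_) is likewise computed once and only copied into the output min/max tensors after the MKL-DNN call, so repeated forward passes of the fused subgraph operator skip the per-pass rescaling. Below is a minimal standalone sketch of that caching pattern; the class, method, and std::vector storage are illustrative assumptions, not the actual MXNet types (the real operator keeps the bias in an NDArray inside SgMKLDNNFCOp), but the rescale formula mirrors the one in the patch.

    // Illustrative sketch of the bias-caching idea only; names and containers are
    // simplified and are NOT the actual SgMKLDNNFCOp implementation.
    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    class CachedQuantizedFCBias {
     public:
      // Rescale the int8 bias to int32 on the first call (or when the bias range
      // changes) and reuse the cached result on every later forward pass.
      const std::vector<int32_t>& Get(const std::vector<int8_t>& bias_int8,
                                      float data_scale, float weight_scale,
                                      float min_bias, float max_bias) {
        const float kInt8Range = 127.0f;
        if (!initialized_ || min_bias != cached_min_bias_ || max_bias != cached_max_bias_) {
          // Same formula as the patch:
          // data_scale * weight_scale * MaxAbs(min_bias, max_bias) / kInt8Range.
          float bias_int32_rescale = data_scale * weight_scale *
              std::max(std::fabs(min_bias), std::fabs(max_bias)) / kInt8Range;
          cached_bias_.resize(bias_int8.size());
          for (std::size_t i = 0; i < bias_int8.size(); ++i) {
            cached_bias_[i] =
                static_cast<int32_t>(std::round(bias_int8[i] * bias_int32_rescale));
          }
          cached_min_bias_ = min_bias;
          cached_max_bias_ = max_bias;
          initialized_ = true;
        }
        return cached_bias_;
      }

     private:
      bool initialized_ = false;
      float cached_min_bias_ = 0.0f;
      float cached_max_bias_ = 0.0f;
      std::vector<int32_t> cached_bias_;
    };

In inference the calibrated ranges are constant, so the recompute branch runs only on the first forward pass; every later pass reuses the cached int32 bias, which is the point of the caching described in the first bullet.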
---
src/operator/nn/fully_connected-inl.h | 6 ++++
.../mkldnn/mkldnn_quantized_fully_connected.cc | 21 +++++------
.../quantization/quantized_fully_connected.cc | 22 +++++++-----
src/operator/subgraph/mkldnn/mkldnn_fc.cc | 41 +++++++++++++---------
4 files changed, 52 insertions(+), 38 deletions(-)
diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h
index 93d384d..e4bb11f 100644
--- a/src/operator/nn/fully_connected-inl.h
+++ b/src/operator/nn/fully_connected-inl.h
@@ -48,6 +48,12 @@ enum FullyConnectedOpResource {kTempSpace};
enum FullyConnectedOpOutputs {kOut};
} // fullc
+namespace quantized_fullc {
+enum QuantizedFCInputMinMax {kDataMin, kDataMax, kWeightMin, kWeightMax, kBiasMin, kBiasMax};
+enum QuantizedFCOutputs {kOut, kOutMin, kOutMax};
+} // quantized_fullc
+
+
struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
int num_hidden;
bool no_bias;
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
index 39f8116..71daf2e 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
@@ -31,11 +31,6 @@
namespace mxnet {
namespace op {
-namespace quantized_fc_enum {
-enum QuantizedFCInputMinMax { kDataMin, kDataMax, kWeightMin, kWeightMax, kBiasMin, kBiasMax };
-enum QuantizedFCOutputs { kOut, kOutMin, kOutMax };
-}
-
void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
const OpContext &ctx,
const std::vector<NDArray> &in_data,
@@ -52,15 +47,15 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
NDArray weight = in_data[fullc::kWeight];
const float min_data =
- in_data[num_inputs + quantized_fc_enum::kDataMin].data().dptr<float>()[0];
+ in_data[num_inputs + quantized_fullc::kDataMin].data().dptr<float>()[0];
const float max_data =
- in_data[num_inputs + quantized_fc_enum::kDataMax].data().dptr<float>()[0];
+ in_data[num_inputs + quantized_fullc::kDataMax].data().dptr<float>()[0];
const float min_weight =
- in_data[num_inputs + quantized_fc_enum::kWeightMin].data().dptr<float>()[0];
+ in_data[num_inputs + quantized_fullc::kWeightMin].data().dptr<float>()[0];
const float max_weight =
- in_data[num_inputs + quantized_fc_enum::kWeightMax].data().dptr<float>()[0];
- float *min_output_ptr = out_data[quantized_fc_enum::kOutMin].data().dptr<float>();
- float *max_output_ptr = out_data[quantized_fc_enum::kOutMax].data().dptr<float>();
+ in_data[num_inputs + quantized_fullc::kWeightMax].data().dptr<float>()[0];
+ float *min_output_ptr = out_data[quantized_fullc::kOutMin].data().dptr<float>();
+ float *max_output_ptr = out_data[quantized_fullc::kOutMax].data().dptr<float>();
auto data_range = (data.dtype() == mshadow::kInt8) ? kInt8Range : kUint8Range;
float data_scale = data_range / MaxAbs(min_data, max_data);
@@ -69,8 +64,8 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
NDArray quantized_bias;
if (!param.no_bias) {
NDArray bias = in_data[fullc::kBias];
- float min_bias = in_data[num_inputs + quantized_fc_enum::kBiasMin].data().dptr<float>()[0];
- float max_bias = in_data[num_inputs + quantized_fc_enum::kBiasMax].data().dptr<float>()[0];
+ float min_bias = in_data[num_inputs + quantized_fullc::kBiasMin].data().dptr<float>()[0];
+ float max_bias = in_data[num_inputs + quantized_fullc::kBiasMax].data().dptr<float>()[0];
float bias_int32_rescale = data_scale * weight_scale * MaxAbs(min_bias, max_bias) / kInt8Range;
quantized_bias = NDArray(bias.storage_type(), bias.shape(),
diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 4718b3b..0a04e71 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -222,20 +222,26 @@ void QuantizedFullyConnectedForwardCPU(const nnvm::NodeAttrs& attrs,
shiftdata.dptr_[i] = data_temp[i] + shift;
}
- Tensor<cpu, 1, float> min_output = out_data[1].get<cpu, 1, float>(s);
- Tensor<cpu, 1, float> max_output = out_data[2].get<cpu, 1, float>(s);
- Tensor<cpu, 1, float> min_data = in_data[num_inputs].get<cpu, 1, float>(s);
- Tensor<cpu, 1, float> max_data = in_data[num_inputs + 1].get<cpu, 1, float>(s);
- Tensor<cpu, 1, float> min_weight = in_data[num_inputs + 2].get<cpu, 1, float>(s);
- Tensor<cpu, 1, float> max_weight = in_data[num_inputs + 3].get<cpu, 1, float>(s);
+ Tensor<cpu, 1, float> min_output = out_data[quantized_fullc::kOutMin].get<cpu, 1, float>(s);
+ Tensor<cpu, 1, float> max_output = out_data[quantized_fullc::kOutMax].get<cpu, 1, float>(s);
+ Tensor<cpu, 1, float> min_data =
+ in_data[num_inputs + quantized_fullc::kDataMin].get<cpu, 1, float>(s);
+ Tensor<cpu, 1, float> max_data =
+ in_data[num_inputs + quantized_fullc::kDataMax].get<cpu, 1, float>(s);
+ Tensor<cpu, 1, float> min_weight =
+ in_data[num_inputs + quantized_fullc::kWeightMin].get<cpu, 1, float>(s);
+ Tensor<cpu, 1, float> max_weight =
+ in_data[num_inputs + quantized_fullc::kWeightMax].get<cpu, 1, float>(s);
Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1, min_output.dptr_,
max_output.dptr_, min_data.dptr_, max_data.dptr_, min_weight.dptr_, max_weight.dptr_);
if (!param.no_bias) {
Tensor<cpu, 1, int8_t> bias = in_data[fullc::kBias].get_with_shape<cpu, 1, int8_t>(
Shape1(wshape[0]), s);
- Tensor<cpu, 1, float> min_bias = in_data[num_inputs + 4].get<cpu, 1, float>(s);
- Tensor<cpu, 1, float> max_bias = in_data[num_inputs + 5].get<cpu, 1, float>(s);
+ Tensor<cpu, 1, float> min_bias =
+ in_data[num_inputs + quantized_fullc::kBiasMin].get<cpu, 1, float>(s);
+ Tensor<cpu, 1, float> max_bias =
+ in_data[num_inputs + quantized_fullc::kBiasMax].get<cpu, 1, float>(s);
Kernel<QuantizedSumInitKernelWithBias, cpu>::Launch(s, n, out.dptr_,
bias.dptr_, min_output.dptr_, max_output.dptr_, min_bias.dptr_, max_bias.dptr_);
diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc.cc b/src/operator/subgraph/mkldnn/mkldnn_fc.cc
index c9e1e1c..0ec05a2 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_fc.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_fc.cc
@@ -63,7 +63,6 @@ class SgMKLDNNFCOp {
nnvm::Symbol subgraph_sym_;
MKLDNNFCFullParam full_param_;
std::shared_ptr<MKLDNNFullyConnectedForward> fwd_;
- NDArray cached_weight_;
NDArray cached_bias_;
float cached_min_data_;
float cached_max_data_;
@@ -71,6 +70,8 @@ class SgMKLDNNFCOp {
float cached_max_weight_;
float cached_min_bias_;
float cached_max_bias_;
+ float cached_min_output_;
+ float cached_max_output_;
};
void SgMKLDNNFCOp::Forward(const OpContext &ctx,
@@ -91,23 +92,19 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
float max_weight = 0.0;
float min_bias = 0.0;
float max_bias = 0.0;
- float *min_output_ptr = nullptr;
- float *max_output_ptr = nullptr;
if (mkldnn_param.quantized) {
total_num_inputs = base_num_inputs * 3;
- min_data = in_data[base_num_inputs].data().dptr<float>()[0];
- max_data = in_data[base_num_inputs + 1].data().dptr<float>()[0];
- min_weight = in_data[base_num_inputs + 2].data().dptr<float>()[0];
- max_weight = in_data[base_num_inputs + 3].data().dptr<float>()[0];
+ min_data = in_data[base_num_inputs + quantized_fullc::kDataMin].data().dptr<float>()[0];
+ max_data = in_data[base_num_inputs + quantized_fullc::kDataMax].data().dptr<float>()[0];
+ min_weight = in_data[base_num_inputs + quantized_fullc::kWeightMin].data().dptr<float>()[0];
+ max_weight = in_data[base_num_inputs + quantized_fullc::kWeightMax].data().dptr<float>()[0];
if (has_bias) {
- min_bias = in_data[base_num_inputs + 4].data().dptr<float>()[0];
- max_bias = in_data[base_num_inputs + 5].data().dptr<float>()[0];
+ min_bias = in_data[base_num_inputs + quantized_fullc::kBiasMin].data().dptr<float>()[0];
+ max_bias = in_data[base_num_inputs + quantized_fullc::kBiasMax].data().dptr<float>()[0];
}
if (!mkldnn_param.enable_float_output) {
total_num_outputs = base_num_outputs * 3;
- min_output_ptr = out_data[1].data().dptr<float>();
- max_output_ptr = out_data[2].data().dptr<float>();
}
}
CHECK_EQ(in_data.size(), total_num_inputs);
@@ -135,6 +132,8 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
cached_max_weight_ = max_weight;
if (has_bias) {
cached_bias_ = in_data[fullc::kBias];
+ cached_min_bias_ = min_bias;
+ cached_max_bias_ = max_bias;
} else {
cached_bias_ = NDArray();
}
@@ -149,7 +148,7 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
if (has_bias) {
NDArray bias = in_data[fullc::kBias];
float bias_int32_rescale = data_scale * weight_scale *
- MaxAbs(min_bias, max_bias) / kInt8Range;
+ MaxAbs(cached_min_bias_, cached_max_bias_) / kInt8Range;
cached_bias_ = NDArray(bias.storage_type(), bias.shape(),
bias.ctx(), true, mshadow::kInt32);
@@ -168,15 +167,16 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
} else if (mkldnn_param.min_calib_range.has_value() &&
mkldnn_param.max_calib_range.has_value()) {
full_param_.output_scales.resize(0);
- *min_output_ptr = mkldnn_param.min_calib_range.value();
- *max_output_ptr = mkldnn_param.max_calib_range.value();
+ cached_min_output_ = mkldnn_param.min_calib_range.value();
+ cached_max_output_ = mkldnn_param.max_calib_range.value();
full_param_.requantize_scales[0] = quantized_out_range /
- MaxAbs(*min_output_ptr, *max_output_ptr) / data_scale / weight_scale;
+ MaxAbs(cached_min_output_, cached_max_output_) / data_scale / weight_scale;
} else {
Stream<cpu> *s = ctx.get_stream<cpu>();
- mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
- min_output_ptr, max_output_ptr, &min_data, &max_data, &min_weight, &max_weight);
+ mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(
+ s, 1, &cached_min_output_, &cached_max_output_,
+ &min_data, &max_data, &min_weight, &max_weight);
}
}
@@ -195,6 +195,13 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
}
MKLDNNFCForwardFullFeature(full_param_, ctx, fwd_.get(), new_inputs, new_req, out_data);
+
+ if (mkldnn_param.quantized && !mkldnn_param.enable_float_output) {
+ float *min_output_ptr = out_data[quantized_fullc::kOutMin].data().dptr<float>();
+ float *max_output_ptr = out_data[quantized_fullc::kOutMax].data().dptr<float>();
+ *min_output_ptr = cached_min_output_;
+ *max_output_ptr = cached_max_output_;
+ }
}
static void SgMKLDNNFCParamParser(nnvm::NodeAttrs *attrs) {