ZhennanQin commented on a change in pull request #14641: [MKLDNN]Improve
quantizeV2 and dequantize latency
URL: https://github.com/apache/incubator-mxnet/pull/14641#discussion_r276048906
##########
File path: src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
##########
@@ -137,21 +75,101 @@ static void MKLDNNQuantizeV2Compute(const
nnvm::NodeAttrs& attrs, const OpContex
}
}
if (req[0] != kWriteInplace) {
- const_cast<NDArray&>(outputs[0]).CopyFrom(*inputs[0].GetMKLDNNData());
+ const_cast<NDArray &>(outputs[0]).CopyFrom(*inputs[0].GetMKLDNNData());
MKLDNNStream::Get()->Submit();
}
} else {
- auto out_type = GetOutputType(param);
+ if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) in_buffer =
inputs[0].Reorder2Default();
+ auto i_mem = in_buffer.GetMKLDNNData();
+
+ if (param_.min_calib_range.has_value() &&
param_.max_calib_range.has_value()) {
+ data_min = param_.min_calib_range.value();
+ data_max = param_.max_calib_range.value();
+ } else {
+ // no calib info
+ in_buffer = inputs[0].Reorder2Default();
+ auto in_ptr = in_buffer.data().dptr<float>();
+ auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+ std::vector<float> data_maxs(nthreads, data_max);
+ std::vector<float> data_mins(nthreads, data_min);
+#pragma omp parallel for num_threads(nthreads)
+ for (index_t i = 0; i < static_cast<index_t>(in_buffer.shape().Size());
i++) {
+ int tid = omp_get_thread_num();
+ if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i];
+ if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i];
+ }
+ for (index_t i = 0; i < nthreads; i++) {
+ if (data_maxs[i] > data_max) data_max = data_maxs[i];
+ if (data_mins[i] < data_min) data_min = data_mins[i];
+ }
+ }
+
+ // Write output min/max
+ auto out_type = GetOutputType(param_);
if (out_type == mshadow::kUint8) {
- MKLDNNQuantizeComputeKer<float, uint8_t>(inputs, outputs, param, req);
+ quantized_range = kUint8Range;
+ *outputs[1].data().dptr<float>() = data_min;
+ *outputs[2].data().dptr<float>() = data_max;
} else if (out_type == mshadow::kInt8) {
- MKLDNNQuantizeComputeKer<float, int8_t>(inputs, outputs, param, req);
+ float real_range = MaxAbs(data_min, data_max);
+ quantized_range = kInt8Range;
+ *outputs[1].data().dptr<float>() = -real_range;
+ *outputs[2].data().dptr<float>() = real_range;
} else {
LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output
type";
}
+
+ if (initalized_ && (cached_data_min_ != data_min || cached_data_max_ !=
data_max))
+ initalized_ = false;
Review comment:
An epsilon isn't necessary for this float comparison, because
`cached_data_min_` is directly assigned from the last used `data_min`, so
`cached_data_min_` and `data_min` should be bit-wise identical if they are the
same. Also, even if this check fails, the only downside is that this op will be
re-initialized and spend extra time re-creating the mkldnn primitive. The output
accuracy is still guaranteed.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services