jverma-quic commented on a change in pull request #8773:
URL: https://github.com/apache/tvm/pull/8773#discussion_r695936244
##########
File path: python/tvm/relay/frontend/onnx.py
##########
@@ -3279,6 +3279,40 @@ def get_scalar(x, dtype="float32"):
return _qnn.op.quantize(out, c_scale, c_zero_point, out_dtype=dtype)
+class QLinearMatMul(OnnxOpConverter):
+ """Operator converter for QLinearMatMul from Microsoft onnxruntime contrib
opset."""
+
+ @classmethod
+ def _impl_v10(cls, inputs, attr, params):
+ def get_scalar(x, dtype="float32"):
+ if isinstance(x, _expr.Var) and x.name_hint in params:
+ return _op.const(params[x.name_hint].numpy(), dtype)
+ rank = len(infer_shape(x))
+ assert rank <= 1, "QLinearMatMul scale and zero_point input must
be scalars"
+ if rank == 1:
+ x = _op.squeeze(x, [0])
+ return _op.cast(x, dtype)
+
+ a = inputs[0]
+ a_scale = get_scalar(inputs[1])
+ a_zero_point = get_scalar(inputs[2], "int32")
+ b = inputs[3]
+ b_scale = get_scalar(inputs[4])
+ b_zero_point = get_scalar(inputs[5], "int32")
+ y_scale = fold_constant(get_scalar(inputs[6]))
+ y_zero_point = get_scalar(inputs[7], "int32")
+
+ dtype = infer_type(a).checked_type.dtype
+
+ ## Onnxruntime doesn't actually do this op in integer, they dequantize
to fp32
## and then requantize after
+ ##
https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/mlas/lib/qlmul.cpp
Review comment:
Even if onnxruntime is performing fp32 operations, is there any reason
to do the same here? Wouldn't it be better (at least somewhat, for
performance) to requantize both inputs 'a' and 'b' per the output scale and
zero_point, and then perform an integer matmul?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]