elvin-n commented on a change in pull request #8883:
URL: https://github.com/apache/tvm/pull/8883#discussion_r702659964
##########
File path: python/tvm/ir/affine_type.py
##########
@@ -50,8 +50,10 @@ class TensorAffineType(AffineType):
The content data type.
"""
- def __init__(self, scale, zero_point, dtype):
- self.__init_handle_by_constructor__(_ffi_api.TensorAffineType, scale,
zero_point, dtype)
+ def __init__(self, scale, zero_point, dtype, axis=-1):
Review comment:
Currently the scale is a scalar in most cases, and it is more common to have
a single value broadcast over the whole tensor. -1 should be fine.
##########
File path: python/tvm/relay/transform/fake_quantization_to_integer.py
##########
@@ -198,19 +219,51 @@ def clip(expr, type_map):
amax = expr.attrs.a_max
scale = fold_constant(t.scale)
z_p = fold_constant(t.zero_point)
- if isinstance(scale, relay.expr.Constant) and isinstance(z_p,
relay.expr.Constant):
+ if (
+ isinstance(scale, relay.expr.Constant)
+ and scale.data.numpy().size == 1
+ and isinstance(z_p, relay.expr.Constant)
+ and z_p.data.numpy().size == 1
+ ):
scale = scale.data.numpy().item()
z_p = z_p.data.numpy().item()
new_min = int(amin / scale + z_p)
new_max = int(amax / scale + z_p)
out = relay.op.clip(arg, new_min, new_max)
else:
- amin = relay.op.round(relay.op.const(amin) / scale + z_p)
- amax = relay.op.round(relay.op.const(amax) / scale + z_p)
- out = relay.op.minimum(relay.op.maximum(arg, amin), amax)
+ if not isinstance(amin, relay.expr.Constant):
+ amin = relay.op.const(amin)
+ if not isinstance(amax, relay.expr.Constant):
+ amax = relay.op.const(amax)
+
+ scale_shape = infer_shape(scale)
+ if len(scale_shape) > 0 and scale_shape[0] > 1:
+ b_shape = [1] * len(infer_shape(arg))
+ b_shape[t.axis] = -1
+ amin = relay.op.reshape(relay.op.broadcast_to(amin, scale_shape),
b_shape)
+ amax = relay.op.reshape(relay.op.broadcast_to(amax, scale_shape),
b_shape)
+ amin = relay.qnn.op.quantize(amin, scale, z_p, t.axis, t.dtype)
+ amax = relay.qnn.op.quantize(amax, scale, z_p, t.axis, t.dtype)
+ out = relay.op.minimum(relay.op.maximum(arg, fold_constant(amin)),
fold_constant(amax))
+
return [out, t]
+@register_fake_quantization_to_integer("nn.relu")
+def relu(expr, type_map):
+ """Rewrite a relu op"""
+ arg = expr.args[0]
+ t = type_map[arg]
+ scale_shape = infer_shape(t.scale)
+ z_p = t.zero_point
Review comment:
We might have four situations here:
1. scale is scalar, zp is scalar
2. scale is scalar, zp is not scalar
3. scale is not scalar, zp is scalar
4. scale is not scalar, zp is not scalar
Cases 3 and 4 are covered by the next if: we broadcast zp to the scale's shape
along the axis from the AffineType structure.
Q: will zp be updated in-place in the TensorAffineType map after the broadcast?
Case 1 is ok.
Case 2 — Q: don't we need to handle this explicitly and broadcast scale?
##########
File path: python/tvm/relay/transform/fake_quantization_to_integer.py
##########
@@ -198,19 +219,51 @@ def clip(expr, type_map):
amax = expr.attrs.a_max
scale = fold_constant(t.scale)
z_p = fold_constant(t.zero_point)
- if isinstance(scale, relay.expr.Constant) and isinstance(z_p,
relay.expr.Constant):
+ if (
+ isinstance(scale, relay.expr.Constant)
+ and scale.data.numpy().size == 1
+ and isinstance(z_p, relay.expr.Constant)
+ and z_p.data.numpy().size == 1
+ ):
scale = scale.data.numpy().item()
z_p = z_p.data.numpy().item()
new_min = int(amin / scale + z_p)
new_max = int(amax / scale + z_p)
out = relay.op.clip(arg, new_min, new_max)
else:
- amin = relay.op.round(relay.op.const(amin) / scale + z_p)
- amax = relay.op.round(relay.op.const(amax) / scale + z_p)
- out = relay.op.minimum(relay.op.maximum(arg, amin), amax)
+ if not isinstance(amin, relay.expr.Constant):
+ amin = relay.op.const(amin)
+ if not isinstance(amax, relay.expr.Constant):
+ amax = relay.op.const(amax)
+
+ scale_shape = infer_shape(scale)
+ if len(scale_shape) > 0 and scale_shape[0] > 1:
+ b_shape = [1] * len(infer_shape(arg))
+ b_shape[t.axis] = -1
+ amin = relay.op.reshape(relay.op.broadcast_to(amin, scale_shape),
b_shape)
+ amax = relay.op.reshape(relay.op.broadcast_to(amax, scale_shape),
b_shape)
+ amin = relay.qnn.op.quantize(amin, scale, z_p, t.axis, t.dtype)
+ amax = relay.qnn.op.quantize(amax, scale, z_p, t.axis, t.dtype)
+ out = relay.op.minimum(relay.op.maximum(arg, fold_constant(amin)),
fold_constant(amax))
+
return [out, t]
+@register_fake_quantization_to_integer("nn.relu")
+def relu(expr, type_map):
+ """Rewrite a relu op"""
+ arg = expr.args[0]
+ t = type_map[arg]
+ scale_shape = infer_shape(t.scale)
+ z_p = t.zero_point
+ if len(scale_shape) > 0 and scale_shape[0] > 1:
Review comment:
Why do we only care about the first value in the shape? Is it because we expect
to have only a vector? Why not verify `len(scale_shape) == 1` then?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]