elvin-n commented on a change in pull request #8883:
URL: https://github.com/apache/tvm/pull/8883#discussion_r702659964
##########
File path: python/tvm/ir/affine_type.py
##########
@@ -50,8 +50,10 @@ class TensorAffineType(AffineType):
The content data type.
"""
- def __init__(self, scale, zero_point, dtype):
- self.__init_handle_by_constructor__(_ffi_api.TensorAffineType, scale,
zero_point, dtype)
+ def __init__(self, scale, zero_point, dtype, axis=-1):
Review comment:
Currently the scale is a scalar in most cases, and it is more common to have
a single value broadcast over the whole tensor. -1 should be fine.
##########
File path: python/tvm/relay/transform/fake_quantization_to_integer.py
##########
@@ -198,19 +219,51 @@ def clip(expr, type_map):
amax = expr.attrs.a_max
scale = fold_constant(t.scale)
z_p = fold_constant(t.zero_point)
- if isinstance(scale, relay.expr.Constant) and isinstance(z_p,
relay.expr.Constant):
+ if (
+ isinstance(scale, relay.expr.Constant)
+ and scale.data.numpy().size == 1
+ and isinstance(z_p, relay.expr.Constant)
+ and z_p.data.numpy().size == 1
+ ):
scale = scale.data.numpy().item()
z_p = z_p.data.numpy().item()
new_min = int(amin / scale + z_p)
new_max = int(amax / scale + z_p)
out = relay.op.clip(arg, new_min, new_max)
else:
- amin = relay.op.round(relay.op.const(amin) / scale + z_p)
- amax = relay.op.round(relay.op.const(amax) / scale + z_p)
- out = relay.op.minimum(relay.op.maximum(arg, amin), amax)
+ if not isinstance(amin, relay.expr.Constant):
+ amin = relay.op.const(amin)
+ if not isinstance(amax, relay.expr.Constant):
+ amax = relay.op.const(amax)
+
+ scale_shape = infer_shape(scale)
+ if len(scale_shape) > 0 and scale_shape[0] > 1:
+ b_shape = [1] * len(infer_shape(arg))
+ b_shape[t.axis] = -1
+ amin = relay.op.reshape(relay.op.broadcast_to(amin, scale_shape),
b_shape)
+ amax = relay.op.reshape(relay.op.broadcast_to(amax, scale_shape),
b_shape)
+ amin = relay.qnn.op.quantize(amin, scale, z_p, t.axis, t.dtype)
+ amax = relay.qnn.op.quantize(amax, scale, z_p, t.axis, t.dtype)
+ out = relay.op.minimum(relay.op.maximum(arg, fold_constant(amin)),
fold_constant(amax))
+
return [out, t]
+@register_fake_quantization_to_integer("nn.relu")
+def relu(expr, type_map):
+ """Rewrite a relu op"""
+ arg = expr.args[0]
+ t = type_map[arg]
+ scale_shape = infer_shape(t.scale)
+ z_p = t.zero_point
Review comment:
We might have four situations here:
1. scale is scalar, zp is scalar
2. scale is scalar, zp is not scalar
3. scale is not scalar, zp is scalar
4. scale is not scalar, zp is not scalar
Cases 3 and 4 are covered by the next if: we broadcast zp to the scale's shape
along the axis from the AffineType structure.
Q: will zp be updated in-place in the TensorAffineType map after the broadcast?
Case 1 is ok.
Case 2 — Q: don't we need to handle this explicitly and broadcast scale?
##########
File path: python/tvm/relay/transform/fake_quantization_to_integer.py
##########
@@ -198,19 +219,51 @@ def clip(expr, type_map):
amax = expr.attrs.a_max
scale = fold_constant(t.scale)
z_p = fold_constant(t.zero_point)
- if isinstance(scale, relay.expr.Constant) and isinstance(z_p,
relay.expr.Constant):
+ if (
+ isinstance(scale, relay.expr.Constant)
+ and scale.data.numpy().size == 1
+ and isinstance(z_p, relay.expr.Constant)
+ and z_p.data.numpy().size == 1
+ ):
scale = scale.data.numpy().item()
z_p = z_p.data.numpy().item()
new_min = int(amin / scale + z_p)
new_max = int(amax / scale + z_p)
out = relay.op.clip(arg, new_min, new_max)
else:
- amin = relay.op.round(relay.op.const(amin) / scale + z_p)
- amax = relay.op.round(relay.op.const(amax) / scale + z_p)
- out = relay.op.minimum(relay.op.maximum(arg, amin), amax)
+ if not isinstance(amin, relay.expr.Constant):
+ amin = relay.op.const(amin)
+ if not isinstance(amax, relay.expr.Constant):
+ amax = relay.op.const(amax)
+
+ scale_shape = infer_shape(scale)
+ if len(scale_shape) > 0 and scale_shape[0] > 1:
+ b_shape = [1] * len(infer_shape(arg))
+ b_shape[t.axis] = -1
+ amin = relay.op.reshape(relay.op.broadcast_to(amin, scale_shape),
b_shape)
+ amax = relay.op.reshape(relay.op.broadcast_to(amax, scale_shape),
b_shape)
+ amin = relay.qnn.op.quantize(amin, scale, z_p, t.axis, t.dtype)
+ amax = relay.qnn.op.quantize(amax, scale, z_p, t.axis, t.dtype)
+ out = relay.op.minimum(relay.op.maximum(arg, fold_constant(amin)),
fold_constant(amax))
+
return [out, t]
+@register_fake_quantization_to_integer("nn.relu")
+def relu(expr, type_map):
+ """Rewrite a relu op"""
+ arg = expr.args[0]
+ t = type_map[arg]
+ scale_shape = infer_shape(t.scale)
+ z_p = t.zero_point
+ if len(scale_shape) > 0 and scale_shape[0] > 1:
Review comment:
Why do we only care about the first value in the shape? Is it because we expect
to have only a vector? Why not verify `len(scale_shape) == 1` then?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]