jcf94 commented on a change in pull request #8402:
URL: https://github.com/apache/tvm/pull/8402#discussion_r665286811



##########
File path: python/tvm/topi/cuda/dense_tensorcore.py
##########
@@ -127,33 +131,36 @@ def _schedule_dense_tensorcore(cfg, s, C):
     cfg.define_knob("offsetCS", [0, 8])
     cfg.define_knob("vec", [1, 2, 4, 8])
 
-    # Ensure that the default parameters are applicable when autotvm is not in 
use
-    if batch % 32 == 0 and out_dim % 8 == 0:
-        cfg.define_knob("wmma_m", [32, 16, 8])
-    elif batch % 16 == 0 and out_dim % 16 == 0:
-        cfg.define_knob("wmma_m", [16, 8, 32])
-    elif batch % 8 == 0 and out_dim % 32 == 0:
-        cfg.define_knob("wmma_m", [8, 16, 32])
+    if data_dtype in ["float16", "int8", "uint8"]:
+        # Ensure that the default parameters are applicable when autotvm is 
not in use
+        if batch % 32 == 0 and out_dim % 8 == 0:
+            cfg.define_knob("wmma_m", [32, 16, 8])
+        elif batch % 16 == 0 and out_dim % 16 == 0:
+            cfg.define_knob("wmma_m", [16, 8, 32])
+        elif batch % 8 == 0 and out_dim % 32 == 0:
+            cfg.define_knob("wmma_m", [8, 16, 32])
+        wmma_k = 16
+        wmma_m = cfg["wmma_m"].val
+        if wmma_m == 16:
+            wmma_n = 16
+        elif wmma_m == 8:
+            wmma_n = 32
+        elif wmma_m == 32:
+            wmma_n = 8
+    else:

Review comment:
       ditto.

##########
File path: python/tvm/topi/cuda/batch_matmul_tensorcore.py
##########
@@ -94,32 +92,35 @@ def _schedule(cfg, s, C):
         cfg.define_knob("vec", [1, 2, 4, 8])
 
         # Ensure that the default parameters are applicable when autotvm is 
not in use
-        if m_dim % 32 == 0 and n_dim % 8 == 0:
-            cfg.define_knob("wmma_m", [32, 16, 8])
-        elif m_dim % 16 == 0 and n_dim % 16 == 0:
-            cfg.define_knob("wmma_m", [16, 8, 32])
-        elif m_dim % 8 == 0 and n_dim % 32 == 0:
-            cfg.define_knob("wmma_m", [8, 16, 32])
+        if data_dtype in ["float16", "uint8", "int8"]:
+            if m_dim % 32 == 0 and n_dim % 8 == 0:
+                cfg.define_knob("wmma_m", [32, 16, 8])
+            elif m_dim % 16 == 0 and n_dim % 16 == 0:
+                cfg.define_knob("wmma_m", [16, 8, 32])
+            elif m_dim % 8 == 0 and n_dim % 32 == 0:
+                cfg.define_knob("wmma_m", [8, 16, 32])
+            wmma_k = 16
+            wmma_m = cfg["wmma_m"].val
+            if wmma_m == 16:
+                wmma_n = 16
+            elif wmma_m == 8:
+                wmma_n = 32
+            elif wmma_m == 32:
+                wmma_n = 8
+        else:

Review comment:
      Is the else branch for int4?
  Even though we can assume the op strategy has already done the type check, I 
still suggest specifying it clearly in the code, and then adding an extra else 
branch to raise a warning.

##########
File path: python/tvm/topi/cuda/tensorcore_alter_op.py
##########
@@ -70,31 +70,38 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
         ):
             # no need to pad
             return None
-
         candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-        (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
-
-        if extra_flops > 2:
-            logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops 
%s", extra_flops)
+    elif dtype in ["int4", "uint4"]:
+        if M % 8 == 0 and K % 32 == 0 and N % 8 == 0:
+            # no need to pad
             return None
 
-        logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", 
extra_flops)
-        if dm or dk:
-            x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
-        else:
-            x_ = x
-        if dn or dk:
-            y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
-        else:
-            y_ = y
-        out_ = relay.nn.batch_matmul(x_, y_)
-        if dm or dn:
-            original_out_shape = [x.value for x in output_tensor.shape]
-            out = relay.strided_slice(out_, begin=[0, 0, 0], 
end=original_out_shape)
-        else:
-            out = out_
-        return out
-    return None
+        candidates = [(8, 32, 8)]
+    else:
+        return None
+
+    (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
+
+    if extra_flops > 2:
+        logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", 
extra_flops)
+        return None
+
+    logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops)
+    if dm or dk:
+        x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
+    else:
+        x_ = x
+    if dn or dk:
+        y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
+    else:
+        y_ = y

Review comment:
       ```suggestion
       x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) if dm or dk 
else x
       y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk))) if dn or dk 
else y
   ```

##########
File path: python/tvm/topi/cuda/tensorcore_alter_op.py
##########
@@ -147,30 +155,37 @@ def _dense_legalize(attrs, inputs, arg_types):
             return None
 
         candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-        (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, 
candidates)
-
-        if extra_flops_ratio > 2:
-            logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio 
%s", extra_flops_ratio)
+    elif dtype in ["int4", "uint4"]:
+        if M % 8 == 0 and K % 32 == 0 and N % 8 == 0:
+            # no need to pad
             return None
-
-        logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", 
extra_flops_ratio)
-
-        if dm or dk:
-            x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
-        else:
-            x_ = x
-        if dn or dk:
-            y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk)))
-        else:
-            y_ = y
-        out_ = relay.nn.dense(x_, y_)
-        if dm or dn:
-            original_out_shape = [x.value for x in output_tensor.shape]
-            out = relay.strided_slice(out_, begin=[0, 0], 
end=original_out_shape)
-        else:
-            out = out_
-        return out
-    return None
+        candidates = [(8, 32, 8)]
+    else:
+        return None
+
+    (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates)
+
+    if extra_flops_ratio > 2:
+        logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", 
extra_flops_ratio)
+        return None
+
+    logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", 
extra_flops_ratio)
+
+    if dm or dk:
+        x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
+    else:
+        x_ = x
+    if dn or dk:
+        y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk)))
+    else:
+        y_ = y
+    out_ = relay.nn.dense(x_, y_, **new_attrs)
+    if dm or dn:
+        original_out_shape = [x.value for x in output_tensor.shape]
+        out = relay.strided_slice(out_, begin=[0, 0], end=original_out_shape)
+    else:
+        out = out_
+    return out

Review comment:
       ditto. Refine these lines.

##########
File path: python/tvm/topi/cuda/tensorcore_alter_op.py
##########
@@ -70,31 +70,38 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
         ):
             # no need to pad
             return None
-
         candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
-        (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
-
-        if extra_flops > 2:
-            logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops 
%s", extra_flops)
+    elif dtype in ["int4", "uint4"]:
+        if M % 8 == 0 and K % 32 == 0 and N % 8 == 0:
+            # no need to pad
             return None
 
-        logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", 
extra_flops)
-        if dm or dk:
-            x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
-        else:
-            x_ = x
-        if dn or dk:
-            y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
-        else:
-            y_ = y
-        out_ = relay.nn.batch_matmul(x_, y_)
-        if dm or dn:
-            original_out_shape = [x.value for x in output_tensor.shape]
-            out = relay.strided_slice(out_, begin=[0, 0, 0], 
end=original_out_shape)
-        else:
-            out = out_
-        return out
-    return None
+        candidates = [(8, 32, 8)]
+    else:
+        return None
+
+    (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
+
+    if extra_flops > 2:
+        logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", 
extra_flops)
+        return None
+
+    logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops)
+    if dm or dk:
+        x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
+    else:
+        x_ = x
+    if dn or dk:
+        y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
+    else:
+        y_ = y
+    out_ = relay.nn.batch_matmul(x_, y_, attrs.out_dtype)
+    if dm or dn:
+        original_out_shape = [x.value for x in output_tensor.shape]
+        out = relay.strided_slice(out_, begin=[0, 0, 0], 
end=original_out_shape)
+    else:
+        out = out_
+    return out

Review comment:
       ```suggestion
       out = relay.strided_slice(out_, begin=[0, 0, 0], end=[x.value for x in 
output_tensor.shape]) if dm or dn else out_
       return out
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to