jcf94 commented on a change in pull request #8402:
URL: https://github.com/apache/tvm/pull/8402#discussion_r665286811
##########
File path: python/tvm/topi/cuda/dense_tensorcore.py
##########
@@ -127,33 +131,36 @@ def _schedule_dense_tensorcore(cfg, s, C):
cfg.define_knob("offsetCS", [0, 8])
cfg.define_knob("vec", [1, 2, 4, 8])
- # Ensure that the default parameters are applicable when autotvm is not in
use
- if batch % 32 == 0 and out_dim % 8 == 0:
- cfg.define_knob("wmma_m", [32, 16, 8])
- elif batch % 16 == 0 and out_dim % 16 == 0:
- cfg.define_knob("wmma_m", [16, 8, 32])
- elif batch % 8 == 0 and out_dim % 32 == 0:
- cfg.define_knob("wmma_m", [8, 16, 32])
+ if data_dtype in ["float16", "int8", "uint8"]:
+ # Ensure that the default parameters are applicable when autotvm is
not in use
+ if batch % 32 == 0 and out_dim % 8 == 0:
+ cfg.define_knob("wmma_m", [32, 16, 8])
+ elif batch % 16 == 0 and out_dim % 16 == 0:
+ cfg.define_knob("wmma_m", [16, 8, 32])
+ elif batch % 8 == 0 and out_dim % 32 == 0:
+ cfg.define_knob("wmma_m", [8, 16, 32])
+ wmma_k = 16
+ wmma_m = cfg["wmma_m"].val
+ if wmma_m == 16:
+ wmma_n = 16
+ elif wmma_m == 8:
+ wmma_n = 32
+ elif wmma_m == 32:
+ wmma_n = 8
+ else:
Review comment:
ditto.
##########
File path: python/tvm/topi/cuda/batch_matmul_tensorcore.py
##########
@@ -94,32 +92,35 @@ def _schedule(cfg, s, C):
cfg.define_knob("vec", [1, 2, 4, 8])
# Ensure that the default parameters are applicable when autotvm is
not in use
- if m_dim % 32 == 0 and n_dim % 8 == 0:
- cfg.define_knob("wmma_m", [32, 16, 8])
- elif m_dim % 16 == 0 and n_dim % 16 == 0:
- cfg.define_knob("wmma_m", [16, 8, 32])
- elif m_dim % 8 == 0 and n_dim % 32 == 0:
- cfg.define_knob("wmma_m", [8, 16, 32])
+ if data_dtype in ["float16", "uint8", "int8"]:
+ if m_dim % 32 == 0 and n_dim % 8 == 0:
+ cfg.define_knob("wmma_m", [32, 16, 8])
+ elif m_dim % 16 == 0 and n_dim % 16 == 0:
+ cfg.define_knob("wmma_m", [16, 8, 32])
+ elif m_dim % 8 == 0 and n_dim % 32 == 0:
+ cfg.define_knob("wmma_m", [8, 16, 32])
+ wmma_k = 16
+ wmma_m = cfg["wmma_m"].val
+ if wmma_m == 16:
+ wmma_n = 16
+ elif wmma_m == 8:
+ wmma_n = 32
+ elif wmma_m == 32:
+ wmma_n = 8
+ else:
Review comment:
Is the else branch intended for int4?
Even though we can assume the op strategy has already done the type check, I
still suggest spelling it out clearly in the code, and then adding an extra else
branch that raises a warning.
##########
File path: python/tvm/topi/cuda/tensorcore_alter_op.py
##########
@@ -70,31 +70,38 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
):
# no need to pad
return None
-
candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
- (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
-
- if extra_flops > 2:
- logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops
%s", extra_flops)
+ elif dtype in ["int4", "uint4"]:
+ if M % 8 == 0 and K % 32 == 0 and N % 8 == 0:
+ # no need to pad
return None
- logger.info("batch_matmul pad_to_tensorcore, extra_flops %s",
extra_flops)
- if dm or dk:
- x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
- else:
- x_ = x
- if dn or dk:
- y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
- else:
- y_ = y
- out_ = relay.nn.batch_matmul(x_, y_)
- if dm or dn:
- original_out_shape = [x.value for x in output_tensor.shape]
- out = relay.strided_slice(out_, begin=[0, 0, 0],
end=original_out_shape)
- else:
- out = out_
- return out
- return None
+ candidates = [(8, 32, 8)]
+ else:
+ return None
+
+ (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
+
+ if extra_flops > 2:
+ logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s",
extra_flops)
+ return None
+
+ logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops)
+ if dm or dk:
+ x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
+ else:
+ x_ = x
+ if dn or dk:
+ y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
+ else:
+ y_ = y
Review comment:
```suggestion
        x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) if dm or dk else x
        y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk))) if dn or dk else y
```
##########
File path: python/tvm/topi/cuda/tensorcore_alter_op.py
##########
@@ -147,30 +155,37 @@ def _dense_legalize(attrs, inputs, arg_types):
return None
candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
- (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N,
candidates)
-
- if extra_flops_ratio > 2:
- logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio
%s", extra_flops_ratio)
+ elif dtype in ["int4", "uint4"]:
+ if M % 8 == 0 and K % 32 == 0 and N % 8 == 0:
+ # no need to pad
return None
-
- logger.info("dense pad_to_tensorcore, extra_flops_ratio %s",
extra_flops_ratio)
-
- if dm or dk:
- x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
- else:
- x_ = x
- if dn or dk:
- y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk)))
- else:
- y_ = y
- out_ = relay.nn.dense(x_, y_)
- if dm or dn:
- original_out_shape = [x.value for x in output_tensor.shape]
- out = relay.strided_slice(out_, begin=[0, 0],
end=original_out_shape)
- else:
- out = out_
- return out
- return None
+ candidates = [(8, 32, 8)]
+ else:
+ return None
+
+ (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N, candidates)
+
+ if extra_flops_ratio > 2:
+ logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s",
extra_flops_ratio)
+ return None
+
+ logger.info("dense pad_to_tensorcore, extra_flops_ratio %s",
extra_flops_ratio)
+
+ if dm or dk:
+ x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk)))
+ else:
+ x_ = x
+ if dn or dk:
+ y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk)))
+ else:
+ y_ = y
+ out_ = relay.nn.dense(x_, y_, **new_attrs)
+ if dm or dn:
+ original_out_shape = [x.value for x in output_tensor.shape]
+ out = relay.strided_slice(out_, begin=[0, 0], end=original_out_shape)
+ else:
+ out = out_
+ return out
Review comment:
ditto. Refine these lines.
##########
File path: python/tvm/topi/cuda/tensorcore_alter_op.py
##########
@@ -70,31 +70,38 @@ def _batch_matmul_legalize(attrs, inputs, arg_types):
):
# no need to pad
return None
-
candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)]
- (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
-
- if extra_flops > 2:
- logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops
%s", extra_flops)
+ elif dtype in ["int4", "uint4"]:
+ if M % 8 == 0 and K % 32 == 0 and N % 8 == 0:
+ # no need to pad
return None
- logger.info("batch_matmul pad_to_tensorcore, extra_flops %s",
extra_flops)
- if dm or dk:
- x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
- else:
- x_ = x
- if dn or dk:
- y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
- else:
- y_ = y
- out_ = relay.nn.batch_matmul(x_, y_)
- if dm or dn:
- original_out_shape = [x.value for x in output_tensor.shape]
- out = relay.strided_slice(out_, begin=[0, 0, 0],
end=original_out_shape)
- else:
- out = out_
- return out
- return None
+ candidates = [(8, 32, 8)]
+ else:
+ return None
+
+ (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N, candidates)
+
+ if extra_flops > 2:
+ logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s",
extra_flops)
+ return None
+
+ logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops)
+ if dm or dk:
+ x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk)))
+ else:
+ x_ = x
+ if dn or dk:
+ y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk)))
+ else:
+ y_ = y
+ out_ = relay.nn.batch_matmul(x_, y_, attrs.out_dtype)
+ if dm or dn:
+ original_out_shape = [x.value for x in output_tensor.shape]
+ out = relay.strided_slice(out_, begin=[0, 0, 0],
end=original_out_shape)
+ else:
+ out = out_
+ return out
Review comment:
```suggestion
        out = relay.strided_slice(out_, begin=[0, 0, 0], end=[x.value for x in output_tensor.shape]) if dm or dn else out_
        return out
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]