This is an automated email from the ASF dual-hosted git repository.
junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new bb2cec1941 [Fix,MetaSchedule] Correct log usage in arithmetic intensity feature (#12806)
bb2cec1941 is described below
commit bb2cec1941db6f5a67b85550faa9279c4c14e2a3
Author: Tristan Konolige <[email protected]>
AuthorDate: Thu Oct 6 17:05:05 2022 -0700
[Fix,MetaSchedule] Correct log usage in arithmetic intensity feature
(#12806)
In meta schedule's featurization, arithmetic intensity was incorrectly
calculated as log(FLOPs) / log(bytes). This change corrects it to
log(FLOPs/bytes). Note that this is the same issue as in #12079.
---
.../feature_extractor/per_store_feature.cc | 21 +++++++---
...schedule_feature_extractor_per_store_feature.py | 46 ++++++++++++----------
2 files changed, 42 insertions(+), 25 deletions(-)
diff --git a/src/meta_schedule/feature_extractor/per_store_feature.cc
b/src/meta_schedule/feature_extractor/per_store_feature.cc
index 698de010b7..422f21abe1 100644
--- a/src/meta_schedule/feature_extractor/per_store_feature.cc
+++ b/src/meta_schedule/feature_extractor/per_store_feature.cc
@@ -1042,6 +1042,17 @@ struct Feature {
/*!
* \brief See the wiki page [1] for details
*
+ * Arithmetic intensity is FLOPs/unique bytes of memory touched. A value is computed
+ * for each set of loop nests starting with just the innermost loop and
+ * reaching to include all loops. There are a variable number of loops, so
+ * n_samples are taken from the curve of arithmetic intensity vs flops. This
+ * biases the values towards larger loops.
+ *
+ * Note that the denominator is unique bytes of memory touched. Repeated
+ * access to the same byte of memory counts as only a single byte touched.
+ *
+ * Values are scaled by log2(x + 1).
+ *
* [1] https://en.wikipedia.org/wiki/Roofline_model
*/
std::vector<double> arith_intensity_curve;
@@ -1060,7 +1071,7 @@ struct Feature {
std::vector<double> memory_bytes;
memory_bytes.resize(n_loops);
for (int i = 0; i < n_loops; ++i) {
- memory_bytes[n_loops - 1 - i] = std::log2(for_touched_bytes[i]);
+ memory_bytes[n_loops - 1 - i] = for_touched_bytes[i];
}
// Calculate `compute_ops` and `cur_compute_ops`
std::vector<double> compute_ops;
@@ -1072,7 +1083,7 @@ struct Feature {
if (const int64_t* extent = GetLoopIntExtent(loops[i])) {
total_compute_ops *= *extent;
}
- compute_ops.push_back(std::log2(total_compute_ops));
+ compute_ops.push_back(total_compute_ops);
}
// Fill the feature set
if (total_compute_ops <= 0 || compute_ops.empty()) {
@@ -1081,7 +1092,7 @@ struct Feature {
}
return;
}
-    total_compute_ops = compute_ops.back();  // i.e. total_compute_ops = log2(total_compute_ops)
+ total_compute_ops = compute_ops.back();
int p = 0;
for (int i = 0; i < n_samples; ++i) {
double& result = arith_intensity_curve[i];
@@ -1094,13 +1105,13 @@ struct Feature {
}
CHECK_LT(p, n_loops);
if (p == 0) {
- result = compute_ops[p] / memory_bytes[p];
+ result = slog(compute_ops[p] / memory_bytes[p]);
} else {
double base = compute_ops[p - 1] / memory_bytes[p - 1];
double slope =
(compute_ops[p] / memory_bytes[p] - compute_ops[p - 1] /
memory_bytes[p - 1]) /
(compute_ops[p] - compute_ops[p - 1]);
- result = base + slope * (cur_compute_ops - compute_ops[p - 1]);
+ result = slog(base + slope * (cur_compute_ops - compute_ops[p - 1]));
}
}
}
diff --git
a/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
b/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
index cad140b8de..701e1826b3 100644
---
a/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
+++
b/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
@@ -365,19 +365,22 @@ def test_cpu_matmul():
atol=1e-5,
)
# Group 3: Arithmetic intensity
+    # arithmetic intensity = flops/bytes touched = 2*512*512*512/(3 * 4 * 512*512)
+    #                 add and multiply ^            3 arrays ^     ^ 4 bytes per f32
+    # = 85.3 but log2 is used so values should be around 6.4
assert_allclose(
actual=f[147:157],
desired=[
- 0.7097842693328857,
- 0.7408391237258911,
- 0.8750449419021606,
- 0.9449487924575806,
- 1.0148526430130005,
- 1.0847564935684204,
- 1.113688349723816,
- 1.1394684314727783,
- 1.2119636535644531,
- 1.2971993684768677,
+ 3.812599,
+ 4.464822,
+ 4.912349,
+ 5.253426,
+ 5.529086,
+ 5.76043,
+ 5.959752,
+ 6.134849,
+ 6.290977,
+ 6.431846,
],
rtol=1e-5,
atol=1e-5,
@@ -1357,19 +1360,22 @@ def test_gpu():
atol=1e-5,
)
# Group 3: Arithmetic intensity
+ # Arithmetic intensity is high here because of repeated use of a shared
+ # buffer. Multiple accesses to the same memory location are counted as a
+ # single byte, skewing these numbers towards higher intensity.
assert_allclose(
actual=f[147:157],
desired=[
- 0.7097842504665767,
- 0.7548801745187567,
- 0.8775907547541741,
- 0.9957389916154509,
- 1.2446737395193135,
- 1.493608487423176,
- 1.7093103019954263,
- 1.8031580276850985,
- 1.9841832691827785,
- 2.204648076869754,
+ 11.98533,
+ 12.977811,
+ 13.562714,
+ 13.977722,
+ 14.299632,
+ 14.562654,
+ 14.785038,
+ 14.977677,
+ 15.147597,
+ 15.299596,
],
rtol=1e-5,
atol=1e-5,