Lunderberg commented on a change in pull request #10604:
URL: https://github.com/apache/tvm/pull/10604#discussion_r834676059



##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(benchmark_result)
+            self.failure_texts.append(None)
+
+        def record_failure(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, outcome):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(None)
+            self.failure_texts.append(outcome)
+
+
+        def dump(self, f):
+            delim = '\t'
+
+            f.write(f'dtype')
+
+            f.write(delim)
+            f.write(f'sched_type')
+
+            f.write(delim)
+            f.write(f'mem_scope')
+
+            f.write(delim)
+            f.write(f'num_vecs_per_tensor')
+
+            f.write(delim)
+            f.write(f'median(µsec)')
+
+            f.write(delim)
+            f.write(f'min(µsec)')
+
+            f.write(delim)
+            f.write(f'max(µsec)')
+
+            f.write(delim)
+            f.write(f'comment')
+
+            f.write('\n')
+
+            for i in range(len(self.dtypes)):
+                f.write('{}'.format(self.dtypes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.sched_types[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.mem_scopes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.nums_vecs_per_tensor[i]))
+
+                r = self.benchmark_results[i]
+                ft = self.failure_texts[i]
+
+                if r is None:
+                    f.write(delim)
+                    f.write(delim)
+                    f.write(delim)
+                else:
+                    median_usec = r.median * 1000000
+                    min_usec = r.min * 1000000
+                    max_usec = r.max * 1000000
+
+                    f.write(delim)
+                    f.write(f'{median_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{min_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{max_usec:.3}')
+
+                if ft is None:
+                    f.write(delim)
+                    f.write('OK')
+                else:
+                    f.write(delim)
+                    f.write(f'FAILURE: {ft}')
+
+                f.write('\n')
+
+    br = benchmark_results_collection()
+
+    for dtype in ['int8',]:  # Hexagon v68 allows more dtypes, but we're 
sticking with v68 for now.
+        for sched_type in [1,2,]:
+            for mem_scope in [None, "global.vtcm"]:
+
+                # These numbers are fairly arbitrary, but they're meant to 
stress memory/caches to
+                # various extents.
+                for num_vectors_per_tensor in [1,16,64,512,2048]:
+
+                    version_name = 
'dtype:{}-schedtype:{}-memscope{}-numvecs:{}'.format(dtype, str(sched_type), 
str(mem_scope), num_vectors_per_tensor)
+                    print("CONFIGURATION: {}".format(version_name))
+
+                    # This is a fixed detail of the v68 architecture.
+                    HVX_VECTOR_BYTES=128
+
+                    dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits
+                    assert dtype_bits % 8 == 0
+                    dtype_bytes = dtype_bits // 8
+
+                    elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes
+
+                    # Note!  We're providing the complete input tensor shapes 
now,
+                    # whereas the original code only reveals the exact shape 
when
+                    # about to call the kernel.
+
+                    shape = [num_vectors_per_tensor, elem_per_hvx_vector,]
+
+                    A = tvm.te.placeholder(shape, dtype=dtype)
+                    B = tvm.te.placeholder(shape, dtype=dtype)
+                    C = tvm.te.compute(A.shape, lambda i,j: A[i,j] + B[i,j], 
name="C")
+
+                    sched = tvm.te.create_schedule(C.op)
+
+                    if sched_type == 1:
+                        pass
+                    elif sched_type == 2:
+                        sched[C].vectorize(C.op.axis[1])
+                    else:
+                        raise Exception("Unknown schedule type")
+
+                    # This module is only created so humans can inspect its IR.
+                    module_for_ir_dump = tvm.lower(sched, [A,B,C], "foo")
+
+                    report_path = os.path.join(host_output_dir, 
f'{version_name}.txt')
+
+                    with open(report_path, 'w') as f:
+                        f.write("LOWERED IR MODULE:\n")
+                        f.write(str(module_for_ir_dump))
+                        f.write('\n')
+
+                        target_hexagon = tvm.target.hexagon("v68", 
link_params=True)
+                        func = tvm.build(
+                            sched, [A, B, C], 
tvm.target.Target(target_hexagon, host=target_hexagon), name="add_hvx"
+                        )
+
+                        host_dso_binary_path = os.path.join(host_output_dir, 
f'test_binary-{version_name}.so')
+                        target_dso_binary_filename = 'test_binary.so'
+
+                        func.save(str(host_dso_binary_path))
+                        print("SAVED BINARY TO HOST PATH: 
{}".format(str(host_dso_binary_path)))
+
+                        if not android_serial_number:

Review comment:
       If there is no hardware connected, this check will skip everything 
remaining in the test.  Since this is specifically for hardware benchmarks, can 
we move this check to the top of the test?

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()

Review comment:
       Does the `host_output_dir` get cleaned up after a run?  If not, this 
could cause the temporary directories to balloon up. Could we add a fixture 
with something like the following, so that the benchmark directory gets cleaned 
up unless an environment variable is set?
   
   ```python
   @pytest.fixture(scope="session")
   def tvm_benchmark_path():
       if 'TVM_BENCHMARK_PATH' in os.environ:
           yield os.environ['TVM_BENCHMARK_PATH']
       else:
           with tempfile.TemporaryDirectory(prefix='tvm_hexagon_benchmarks_') 
as temp_dir:
               yield temp_dir
   ```

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """

Review comment:
       Should this run benchmarks on the simulator as well as physical 
hardware?  If they should only run on physical hardware, can we add `if 
android_serial_number == 'simulator': pytest.skip()`?

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)

Review comment:
       Instead of storing the results in several lists, can we use either a 
list of tuples, or a list of dictionaries?

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(benchmark_result)
+            self.failure_texts.append(None)
+
+        def record_failure(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, outcome):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(None)
+            self.failure_texts.append(outcome)
+
+
+        def dump(self, f):
+            delim = '\t'
+
+            f.write(f'dtype')
+
+            f.write(delim)
+            f.write(f'sched_type')
+
+            f.write(delim)
+            f.write(f'mem_scope')
+
+            f.write(delim)
+            f.write(f'num_vecs_per_tensor')
+
+            f.write(delim)
+            f.write(f'median(µsec)')
+
+            f.write(delim)
+            f.write(f'min(µsec)')
+
+            f.write(delim)
+            f.write(f'max(µsec)')
+
+            f.write(delim)
+            f.write(f'comment')
+
+            f.write('\n')
+
+            for i in range(len(self.dtypes)):
+                f.write('{}'.format(self.dtypes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.sched_types[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.mem_scopes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.nums_vecs_per_tensor[i]))
+
+                r = self.benchmark_results[i]
+                ft = self.failure_texts[i]
+
+                if r is None:
+                    f.write(delim)
+                    f.write(delim)
+                    f.write(delim)
+                else:
+                    median_usec = r.median * 1000000
+                    min_usec = r.min * 1000000
+                    max_usec = r.max * 1000000
+
+                    f.write(delim)
+                    f.write(f'{median_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{min_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{max_usec:.3}')
+
+                if ft is None:
+                    f.write(delim)
+                    f.write('OK')
+                else:
+                    f.write(delim)
+                    f.write(f'FAILURE: {ft}')
+
+                f.write('\n')

Review comment:
       Rather than generating the csv file explicitly, would it be easier to 
use [python's stdlib 
`csv.DictWriter`](https://docs.python.org/3/library/csv.html#csv.DictWriter)?  
That is, `record_success` and `record_failure` would push to a list of 
dictionaries of results, then `dump` would loop over that list and call 
`writer.writerow(result)` for each item in the list.

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(benchmark_result)
+            self.failure_texts.append(None)
+
+        def record_failure(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, outcome):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(None)
+            self.failure_texts.append(outcome)
+
+
+        def dump(self, f):
+            delim = '\t'
+
+            f.write(f'dtype')
+
+            f.write(delim)
+            f.write(f'sched_type')
+
+            f.write(delim)
+            f.write(f'mem_scope')
+
+            f.write(delim)
+            f.write(f'num_vecs_per_tensor')
+
+            f.write(delim)
+            f.write(f'median(µsec)')
+
+            f.write(delim)
+            f.write(f'min(µsec)')
+
+            f.write(delim)
+            f.write(f'max(µsec)')
+
+            f.write(delim)
+            f.write(f'comment')
+
+            f.write('\n')
+
+            for i in range(len(self.dtypes)):
+                f.write('{}'.format(self.dtypes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.sched_types[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.mem_scopes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.nums_vecs_per_tensor[i]))
+
+                r = self.benchmark_results[i]
+                ft = self.failure_texts[i]
+
+                if r is None:
+                    f.write(delim)
+                    f.write(delim)
+                    f.write(delim)
+                else:
+                    median_usec = r.median * 1000000
+                    min_usec = r.min * 1000000
+                    max_usec = r.max * 1000000
+
+                    f.write(delim)
+                    f.write(f'{median_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{min_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{max_usec:.3}')
+
+                if ft is None:
+                    f.write(delim)
+                    f.write('OK')
+                else:
+                    f.write(delim)
+                    f.write(f'FAILURE: {ft}')
+
+                f.write('\n')
+
+    br = benchmark_results_collection()
+
+    for dtype in ['int8',]:  # Hexagon v68 allows more dtypes, but we're 
sticking with v68 for now.
+        for sched_type in [1,2,]:
+            for mem_scope in [None, "global.vtcm"]:
+
+                # These numbers are fairly arbitrary, but they're meant to 
stress memory/caches to
+                # various extents.
+                for num_vectors_per_tensor in [1,16,64,512,2048]:
+
+                    version_name = 
'dtype:{}-schedtype:{}-memscope{}-numvecs:{}'.format(dtype, str(sched_type), 
str(mem_scope), num_vectors_per_tensor)
+                    print("CONFIGURATION: {}".format(version_name))
+
+                    # This is a fixed detail of the v68 architecture.
+                    HVX_VECTOR_BYTES=128
+
+                    dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits
+                    assert dtype_bits % 8 == 0
+                    dtype_bytes = dtype_bits // 8
+
+                    elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes
+
+                    # Note!  We're providing the complete input tensor shapes 
now,
+                    # whereas the original code only reveals the exact shape 
when
+                    # about to call the kernel.
+
+                    shape = [num_vectors_per_tensor, elem_per_hvx_vector,]
+
+                    A = tvm.te.placeholder(shape, dtype=dtype)
+                    B = tvm.te.placeholder(shape, dtype=dtype)
+                    C = tvm.te.compute(A.shape, lambda i,j: A[i,j] + B[i,j], 
name="C")
+
+                    sched = tvm.te.create_schedule(C.op)
+
+                    if sched_type == 1:
+                        pass
+                    elif sched_type == 2:
+                        sched[C].vectorize(C.op.axis[1])
+                    else:
+                        raise Exception("Unknown schedule type")
+
+                    # This module is only created so humans can inspect its IR.
+                    module_for_ir_dump = tvm.lower(sched, [A,B,C], "foo")
+
+                    report_path = os.path.join(host_output_dir, 
f'{version_name}.txt')
+
+                    with open(report_path, 'w') as f:
+                        f.write("LOWERED IR MODULE:\n")
+                        f.write(str(module_for_ir_dump))
+                        f.write('\n')
+
+                        target_hexagon = tvm.target.hexagon("v68", 
link_params=True)
+                        func = tvm.build(
+                            sched, [A, B, C], 
tvm.target.Target(target_hexagon, host=target_hexagon), name="add_hvx"
+                        )
+
+                        host_dso_binary_path = os.path.join(host_output_dir, 
f'test_binary-{version_name}.so')
+                        target_dso_binary_filename = 'test_binary.so'
+
+                        func.save(str(host_dso_binary_path))
+                        print("SAVED BINARY TO HOST PATH: 
{}".format(str(host_dso_binary_path)))
+
+                        if not android_serial_number:
+                            pytest.skip(msg="Skip hardware test since 
ANDROID_SERIAL_NUMBER is not set.")
+
+                        rpc_info = {
+                            "rpc_tracker_host": tvm_tracker_host,
+                            "rpc_tracker_port": tvm_tracker_port,
+                            "rpc_server_port": RPC_SERVER_PORT + 0,  # See 
note at the beginning of the file
+                            "adb_server_socket": adb_server_socket,
+                        }
+                        launcher = 
HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
+                        launcher.upload(str(host_dso_binary_path), 
str(target_dso_binary_filename))
+                        launcher.start_server()
+
+                        try:

Review comment:
       Do we know which sections are likely to fail, and can we narrow the 
scope of the `try` block to just those sections?

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(benchmark_result)
+            self.failure_texts.append(None)
+
+        def record_failure(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, outcome):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(None)
+            self.failure_texts.append(outcome)
+
+
+        def dump(self, f):
+            delim = '\t'
+
+            f.write(f'dtype')
+
+            f.write(delim)
+            f.write(f'sched_type')
+
+            f.write(delim)
+            f.write(f'mem_scope')
+
+            f.write(delim)
+            f.write(f'num_vecs_per_tensor')
+
+            f.write(delim)
+            f.write(f'median(µsec)')
+
+            f.write(delim)
+            f.write(f'min(µsec)')
+
+            f.write(delim)
+            f.write(f'max(µsec)')
+
+            f.write(delim)
+            f.write(f'comment')
+
+            f.write('\n')
+
+            for i in range(len(self.dtypes)):
+                f.write('{}'.format(self.dtypes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.sched_types[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.mem_scopes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.nums_vecs_per_tensor[i]))
+
+                r = self.benchmark_results[i]
+                ft = self.failure_texts[i]
+
+                if r is None:
+                    f.write(delim)
+                    f.write(delim)
+                    f.write(delim)
+                else:
+                    median_usec = r.median * 1000000
+                    min_usec = r.min * 1000000
+                    max_usec = r.max * 1000000
+
+                    f.write(delim)
+                    f.write(f'{median_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{min_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{max_usec:.3}')
+
+                if ft is None:
+                    f.write(delim)
+                    f.write('OK')
+                else:
+                    f.write(delim)
+                    f.write(f'FAILURE: {ft}')
+
+                f.write('\n')
+
+    br = benchmark_results_collection()
+
+    for dtype in ['int8',]:  # Hexagon v68 allows more dtypes, but we're 
sticking with v68 for now.
+        for sched_type in [1,2,]:
+            for mem_scope in [None, "global.vtcm"]:
+
+                # These numbers are fairly arbitrary, but they're meant to 
stress memory/caches to
+                # various extents.
+                for num_vectors_per_tensor in [1,16,64,512,2048]:
+
+                    version_name = 
'dtype:{}-schedtype:{}-memscope{}-numvecs:{}'.format(dtype, str(sched_type), 
str(mem_scope), num_vectors_per_tensor)
+                    print("CONFIGURATION: {}".format(version_name))
+
+                    # This is a fixed detail of the v68 architecture.
+                    HVX_VECTOR_BYTES=128
+
+                    dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits
+                    assert dtype_bits % 8 == 0
+                    dtype_bytes = dtype_bits // 8
+
+                    elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes
+
+                    # Note!  We're providing the complete input tensor shapes 
now,
+                    # whereas the original code only reveals the exact shape 
when
+                    # about to call the kernel.
+
+                    shape = [num_vectors_per_tensor, elem_per_hvx_vector,]
+
+                    A = tvm.te.placeholder(shape, dtype=dtype)
+                    B = tvm.te.placeholder(shape, dtype=dtype)
+                    C = tvm.te.compute(A.shape, lambda i,j: A[i,j] + B[i,j], 
name="C")
+
+                    sched = tvm.te.create_schedule(C.op)
+
+                    if sched_type == 1:
+                        pass
+                    elif sched_type == 2:
+                        sched[C].vectorize(C.op.axis[1])
+                    else:
+                        raise Exception("Unknown schedule type")
+
+                    # This module is only created so humans can inspect its IR.
+                    module_for_ir_dump = tvm.lower(sched, [A,B,C], "foo")
+
+                    report_path = os.path.join(host_output_dir, 
f'{version_name}.txt')
+
+                    with open(report_path, 'w') as f:
+                        f.write("LOWERED IR MODULE:\n")
+                        f.write(str(module_for_ir_dump))
+                        f.write('\n')
+
+                        target_hexagon = tvm.target.hexagon("v68", 
link_params=True)
+                        func = tvm.build(
+                            sched, [A, B, C], 
tvm.target.Target(target_hexagon, host=target_hexagon), name="add_hvx"
+                        )
+
+                        host_dso_binary_path = os.path.join(host_output_dir, 
f'test_binary-{version_name}.so')
+                        target_dso_binary_filename = 'test_binary.so'
+
+                        func.save(str(host_dso_binary_path))
+                        print("SAVED BINARY TO HOST PATH: 
{}".format(str(host_dso_binary_path)))
+
+                        if not android_serial_number:
+                            pytest.skip(msg="Skip hardware test since 
ANDROID_SERIAL_NUMBER is not set.")
+
+                        rpc_info = {
+                            "rpc_tracker_host": tvm_tracker_host,
+                            "rpc_tracker_port": tvm_tracker_port,
+                            "rpc_server_port": RPC_SERVER_PORT + 0,  # See 
note at the beginning of the file
+                            "adb_server_socket": adb_server_socket,
+                        }
+                        launcher = 
HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
+                        launcher.upload(str(host_dso_binary_path), 
str(target_dso_binary_filename))
+                        launcher.start_server()
+
+                        try:
+                            with launcher.start_session() as sess:
+                                mod = 
launcher.load_module(target_dso_binary_filename, sess)
+
+                                host_numpy_A_data = np.ndarray(shape, 
dtype=dtype)
+                                host_numpy_B_data = np.ndarray(shape, 
dtype=dtype)
+                                host_numpy_C_data = np.ndarray(shape, 
dtype=dtype)
+                                host_numpy_C_data_expected = np.ndarray(shape, 
dtype=dtype)
+
+                                def intended_val_A(i,j):
+                                    return i + j
+
+                                def intended_val_B(i,j):
+                                    return (i+1) * (j+1)
+
+                                for i in range(shape[0]):

Review comment:
       Iteration over large arrays tends to be pretty slow in Python.  For 
numpy types, the iteration can be offloaded to compiled functions that operate 
on the entire array.
   
   ```python
   j,i = np.meshgrid(np.arange(shape[1]), np.arange(shape[0]))
   host_numpy_A_data = (i + j).astype(dtype)
   host_numpy_B_data = ((i+1) * (j+1)).astype(dtype)
   host_numpy_C_data_expected = host_numpy_A_data + host_numpy_B_data
   ```

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(benchmark_result)
+            self.failure_texts.append(None)
+
+        def record_failure(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, outcome):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(None)
+            self.failure_texts.append(outcome)
+
+
+        def dump(self, f):
+            delim = '\t'
+
+            f.write(f'dtype')
+
+            f.write(delim)
+            f.write(f'sched_type')
+
+            f.write(delim)
+            f.write(f'mem_scope')
+
+            f.write(delim)
+            f.write(f'num_vecs_per_tensor')
+
+            f.write(delim)
+            f.write(f'median(µsec)')
+
+            f.write(delim)
+            f.write(f'min(µsec)')
+
+            f.write(delim)
+            f.write(f'max(µsec)')
+
+            f.write(delim)
+            f.write(f'comment')
+
+            f.write('\n')
+
+            for i in range(len(self.dtypes)):
+                f.write('{}'.format(self.dtypes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.sched_types[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.mem_scopes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.nums_vecs_per_tensor[i]))
+
+                r = self.benchmark_results[i]
+                ft = self.failure_texts[i]
+
+                if r is None:
+                    f.write(delim)
+                    f.write(delim)
+                    f.write(delim)
+                else:
+                    median_usec = r.median * 1000000
+                    min_usec = r.min * 1000000
+                    max_usec = r.max * 1000000
+
+                    f.write(delim)
+                    f.write(f'{median_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{min_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{max_usec:.3}')
+
+                if ft is None:
+                    f.write(delim)
+                    f.write('OK')
+                else:
+                    f.write(delim)
+                    f.write(f'FAILURE: {ft}')
+
+                f.write('\n')
+
+    br = benchmark_results_collection()
+
+    for dtype in ['int8',]:  # Hexagon v68 allows more dtypes, but we're 
sticking with v68 for now.
+        for sched_type in [1,2,]:
+            for mem_scope in [None, "global.vtcm"]:
+
+                # These numbers are fairly arbitrary, but they're meant to 
stress memory/caches to
+                # various extents.
+                for num_vectors_per_tensor in [1,16,64,512,2048]:
+
+                    version_name = 
'dtype:{}-schedtype:{}-memscope{}-numvecs:{}'.format(dtype, str(sched_type), 
str(mem_scope), num_vectors_per_tensor)
+                    print("CONFIGURATION: {}".format(version_name))
+
+                    # This is a fixed detail of the v68 architecture.
+                    HVX_VECTOR_BYTES=128
+
+                    dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits
+                    assert dtype_bits % 8 == 0
+                    dtype_bytes = dtype_bits // 8
+
+                    elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes
+
+                    # Note!  We're providing the complete input tensor shapes 
now,
+                    # whereas the original code only reveals the exact shape 
when
+                    # about to call the kernel.
+
+                    shape = [num_vectors_per_tensor, elem_per_hvx_vector,]
+
+                    A = tvm.te.placeholder(shape, dtype=dtype)
+                    B = tvm.te.placeholder(shape, dtype=dtype)
+                    C = tvm.te.compute(A.shape, lambda i,j: A[i,j] + B[i,j], 
name="C")
+
+                    sched = tvm.te.create_schedule(C.op)
+
+                    if sched_type == 1:
+                        pass
+                    elif sched_type == 2:
+                        sched[C].vectorize(C.op.axis[1])
+                    else:
+                        raise Exception("Unknown schedule type")
+
+                    # This module is only created so humans can inspect its IR.
+                    module_for_ir_dump = tvm.lower(sched, [A,B,C], "foo")
+
+                    report_path = os.path.join(host_output_dir, 
f'{version_name}.txt')
+
+                    with open(report_path, 'w') as f:
+                        f.write("LOWERED IR MODULE:\n")
+                        f.write(str(module_for_ir_dump))
+                        f.write('\n')
+
+                        target_hexagon = tvm.target.hexagon("v68", 
link_params=True)
+                        func = tvm.build(
+                            sched, [A, B, C], 
tvm.target.Target(target_hexagon, host=target_hexagon), name="add_hvx"
+                        )
+
+                        host_dso_binary_path = os.path.join(host_output_dir, 
f'test_binary-{version_name}.so')
+                        target_dso_binary_filename = 'test_binary.so'
+
+                        func.save(str(host_dso_binary_path))
+                        print("SAVED BINARY TO HOST PATH: 
{}".format(str(host_dso_binary_path)))
+
+                        if not android_serial_number:
+                            pytest.skip(msg="Skip hardware test since 
ANDROID_SERIAL_NUMBER is not set.")
+
+                        rpc_info = {
+                            "rpc_tracker_host": tvm_tracker_host,
+                            "rpc_tracker_port": tvm_tracker_port,
+                            "rpc_server_port": RPC_SERVER_PORT + 0,  # See 
note at the beginning of the file
+                            "adb_server_socket": adb_server_socket,
+                        }
+                        launcher = 
HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
+                        launcher.upload(str(host_dso_binary_path), 
str(target_dso_binary_filename))
+                        launcher.start_server()
+
+                        try:
+                            with launcher.start_session() as sess:
+                                mod = 
launcher.load_module(target_dso_binary_filename, sess)
+
+                                host_numpy_A_data = np.ndarray(shape, 
dtype=dtype)
+                                host_numpy_B_data = np.ndarray(shape, 
dtype=dtype)
+                                host_numpy_C_data = np.ndarray(shape, 
dtype=dtype)
+                                host_numpy_C_data_expected = np.ndarray(shape, 
dtype=dtype)
+
+                                def intended_val_A(i,j):
+                                    return i + j
+
+                                def intended_val_B(i,j):
+                                    return (i+1) * (j+1)
+
+                                for i in range(shape[0]):
+                                    for j in range(shape[1]):
+                                        host_numpy_A_data[i,j] = 
intended_val_A(i,j)
+                                        host_numpy_B_data[i,j] = 
intended_val_B(i,j)
+                                        host_numpy_C_data_expected[i,j] = 
intended_val_A(i,j) + intended_val_B(i,j)
+
+                                A_data = tvm.nd.empty(shape, dtype, 
sess.device, mem_scope)
+                                A_data.copyfrom(host_numpy_A_data)
+
+                                B_data = tvm.nd.empty(shape, dtype, 
sess.device, mem_scope)
+                                B_data.copyfrom(host_numpy_B_data)
+
+                                C_data = tvm.nd.empty(shape, dtype, 
sess.device, mem_scope)
+
+                                timer = mod.time_evaluator("add_hvx", 
sess.device, number=100, repeat=1, min_repeat_ms=1000)
+                                timing_result = timer(A_data, B_data, C_data)
+
+                                print("TIMING RESULT: 
{}".format(timing_result))
+
+                                # Verify that the computation actually 
happened, and produced the correct result.
+                                result = C_data.numpy()
+                                assert (result == 
host_numpy_C_data_expected).all()
+
+                                br.record_success(dtype, sched_type, 
mem_scope, num_vectors_per_tensor, timing_result)
+                        except:

Review comment:
       Prefer `except Exception` rather than a bare `except`.  A bare `except` 
will catch much more, including 
[`KeyboardInterrupt`](https://docs.python.org/3/library/exceptions.html#KeyboardInterrupt),
 making it impossible to Ctrl-C out of the entire script.

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(benchmark_result)
+            self.failure_texts.append(None)
+
+        def record_failure(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, outcome):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(None)
+            self.failure_texts.append(outcome)
+
+
+        def dump(self, f):
+            delim = '\t'
+
+            f.write(f'dtype')
+
+            f.write(delim)
+            f.write(f'sched_type')
+
+            f.write(delim)
+            f.write(f'mem_scope')
+
+            f.write(delim)
+            f.write(f'num_vecs_per_tensor')
+
+            f.write(delim)
+            f.write(f'median(µsec)')
+
+            f.write(delim)
+            f.write(f'min(µsec)')
+
+            f.write(delim)
+            f.write(f'max(µsec)')
+
+            f.write(delim)
+            f.write(f'comment')
+
+            f.write('\n')
+
+            for i in range(len(self.dtypes)):
+                f.write('{}'.format(self.dtypes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.sched_types[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.mem_scopes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.nums_vecs_per_tensor[i]))
+
+                r = self.benchmark_results[i]
+                ft = self.failure_texts[i]
+
+                if r is None:
+                    f.write(delim)
+                    f.write(delim)
+                    f.write(delim)
+                else:
+                    median_usec = r.median * 1000000
+                    min_usec = r.min * 1000000
+                    max_usec = r.max * 1000000
+
+                    f.write(delim)
+                    f.write(f'{median_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{min_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{max_usec:.3}')
+
+                if ft is None:
+                    f.write(delim)
+                    f.write('OK')
+                else:
+                    f.write(delim)
+                    f.write(f'FAILURE: {ft}')
+
+                f.write('\n')
+
+    br = benchmark_results_collection()
+
+    for dtype in ['int8',]:  # Hexagon v68 allows more dtypes, but we're 
sticking with v68 for now.
+        for sched_type in [1,2,]:
+            for mem_scope in [None, "global.vtcm"]:
+
+                # These numbers are fairly arbitrary, but they're meant to 
stress memory/caches to
+                # various extents.
+                for num_vectors_per_tensor in [1,16,64,512,2048]:
+
+                    version_name = 
'dtype:{}-schedtype:{}-memscope{}-numvecs:{}'.format(dtype, str(sched_type), 
str(mem_scope), num_vectors_per_tensor)
+                    print("CONFIGURATION: {}".format(version_name))
+
+                    # This is a fixed detail of the v68 architecture.
+                    HVX_VECTOR_BYTES=128
+
+                    dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits
+                    assert dtype_bits % 8 == 0
+                    dtype_bytes = dtype_bits // 8
+
+                    elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes
+
+                    # Note!  We're providing the complete input tensor shapes 
now,
+                    # whereas the original code only reveals the exact shape 
when
+                    # about to call the kernel.
+
+                    shape = [num_vectors_per_tensor, elem_per_hvx_vector,]
+
+                    A = tvm.te.placeholder(shape, dtype=dtype)
+                    B = tvm.te.placeholder(shape, dtype=dtype)
+                    C = tvm.te.compute(A.shape, lambda i,j: A[i,j] + B[i,j], 
name="C")
+
+                    sched = tvm.te.create_schedule(C.op)
+
+                    if sched_type == 1:
+                        pass
+                    elif sched_type == 2:
+                        sched[C].vectorize(C.op.axis[1])
+                    else:
+                        raise Exception("Unknown schedule type")
+
+                    # This module is only created so humans can inspect its IR.
+                    module_for_ir_dump = tvm.lower(sched, [A,B,C], "foo")
+
+                    report_path = os.path.join(host_output_dir, 
f'{version_name}.txt')
+
+                    with open(report_path, 'w') as f:
+                        f.write("LOWERED IR MODULE:\n")
+                        f.write(str(module_for_ir_dump))
+                        f.write('\n')
+
+                        target_hexagon = tvm.target.hexagon("v68", 
link_params=True)
+                        func = tvm.build(
+                            sched, [A, B, C], 
tvm.target.Target(target_hexagon, host=target_hexagon), name="add_hvx"
+                        )
+
+                        host_dso_binary_path = os.path.join(host_output_dir, 
f'test_binary-{version_name}.so')
+                        target_dso_binary_filename = 'test_binary.so'
+
+                        func.save(str(host_dso_binary_path))
+                        print("SAVED BINARY TO HOST PATH: 
{}".format(str(host_dso_binary_path)))
+
+                        if not android_serial_number:
+                            pytest.skip(msg="Skip hardware test since 
ANDROID_SERIAL_NUMBER is not set.")
+
+                        rpc_info = {
+                            "rpc_tracker_host": tvm_tracker_host,
+                            "rpc_tracker_port": tvm_tracker_port,
+                            "rpc_server_port": RPC_SERVER_PORT + 0,  # See 
note at the beginning of the file
+                            "adb_server_socket": adb_server_socket,
+                        }
+                        launcher = 
HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
+                        launcher.upload(str(host_dso_binary_path), 
str(target_dso_binary_filename))
+                        launcher.start_server()
+
+                        try:
+                            with launcher.start_session() as sess:
+                                mod = 
launcher.load_module(target_dso_binary_filename, sess)
+
+                                host_numpy_A_data = np.ndarray(shape, 
dtype=dtype)
+                                host_numpy_B_data = np.ndarray(shape, 
dtype=dtype)
+                                host_numpy_C_data = np.ndarray(shape, 
dtype=dtype)
+                                host_numpy_C_data_expected = np.ndarray(shape, 
dtype=dtype)
+
+                                def intended_val_A(i,j):
+                                    return i + j
+
+                                def intended_val_B(i,j):
+                                    return (i+1) * (j+1)
+
+                                for i in range(shape[0]):
+                                    for j in range(shape[1]):
+                                        host_numpy_A_data[i,j] = 
intended_val_A(i,j)
+                                        host_numpy_B_data[i,j] = 
intended_val_B(i,j)
+                                        host_numpy_C_data_expected[i,j] = 
intended_val_A(i,j) + intended_val_B(i,j)
+
+                                A_data = tvm.nd.empty(shape, dtype, 
sess.device, mem_scope)
+                                A_data.copyfrom(host_numpy_A_data)
+
+                                B_data = tvm.nd.empty(shape, dtype, 
sess.device, mem_scope)
+                                B_data.copyfrom(host_numpy_B_data)
+
+                                C_data = tvm.nd.empty(shape, dtype, 
sess.device, mem_scope)
+
+                                timer = mod.time_evaluator("add_hvx", 
sess.device, number=100, repeat=1, min_repeat_ms=1000)
+                                timing_result = timer(A_data, B_data, C_data)
+
+                                print("TIMING RESULT: 
{}".format(timing_result))
+
+                                # Verify that the computation actually 
happened, and produced the correct result.
+                                result = C_data.numpy()
+                                assert (result == 
host_numpy_C_data_expected).all()

Review comment:
       Prefer utility function `tvm.testing.assert_allclose`, which gives a 
summary of discrepancies between expected and observed arrays.

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(benchmark_result)
+            self.failure_texts.append(None)
+
+        def record_failure(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, outcome):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(None)
+            self.failure_texts.append(outcome)
+
+
+        def dump(self, f):
+            delim = '\t'
+
+            f.write(f'dtype')
+
+            f.write(delim)
+            f.write(f'sched_type')
+
+            f.write(delim)
+            f.write(f'mem_scope')
+
+            f.write(delim)
+            f.write(f'num_vecs_per_tensor')
+
+            f.write(delim)
+            f.write(f'median(µsec)')
+
+            f.write(delim)
+            f.write(f'min(µsec)')
+
+            f.write(delim)
+            f.write(f'max(µsec)')
+
+            f.write(delim)
+            f.write(f'comment')
+
+            f.write('\n')
+
+            for i in range(len(self.dtypes)):
+                f.write('{}'.format(self.dtypes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.sched_types[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.mem_scopes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.nums_vecs_per_tensor[i]))
+
+                r = self.benchmark_results[i]
+                ft = self.failure_texts[i]
+
+                if r is None:
+                    f.write(delim)
+                    f.write(delim)
+                    f.write(delim)
+                else:
+                    median_usec = r.median * 1000000
+                    min_usec = r.min * 1000000
+                    max_usec = r.max * 1000000
+
+                    f.write(delim)
+                    f.write(f'{median_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{min_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{max_usec:.3}')
+
+                if ft is None:
+                    f.write(delim)
+                    f.write('OK')
+                else:
+                    f.write(delim)
+                    f.write(f'FAILURE: {ft}')
+
+                f.write('\n')
+
+    br = benchmark_results_collection()
+
+    for dtype in ['int8',]:  # Hexagon v68 allows more dtypes, but we're 
sticking with v68 for now.
+        for sched_type in [1,2,]:
+            for mem_scope in [None, "global.vtcm"]:

Review comment:
       Instead of `None`, can we explicitly specify `"global"` memory scope?  
It's the default memory scope, but that makes it clear what the alternative to 
`"global.vtcm"` is.

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(benchmark_result)
+            self.failure_texts.append(None)
+
+        def record_failure(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, outcome):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(None)
+            self.failure_texts.append(outcome)
+
+
+        def dump(self, f):
+            delim = '\t'
+
+            f.write(f'dtype')
+
+            f.write(delim)
+            f.write(f'sched_type')
+
+            f.write(delim)
+            f.write(f'mem_scope')
+
+            f.write(delim)
+            f.write(f'num_vecs_per_tensor')
+
+            f.write(delim)
+            f.write(f'median(µsec)')
+
+            f.write(delim)
+            f.write(f'min(µsec)')
+
+            f.write(delim)
+            f.write(f'max(µsec)')
+
+            f.write(delim)
+            f.write(f'comment')

Review comment:
       Nitpick: The format strings aren't necessary here.  Can also reduce 
repetition with `str.join`.
   
   ```
   column_names = [...]
   f.write(delim.join(column_names))
   ```

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(benchmark_result)
+            self.failure_texts.append(None)
+
+        def record_failure(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, outcome):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(None)
+            self.failure_texts.append(outcome)
+
+
+        def dump(self, f):
+            delim = '\t'
+
+            f.write(f'dtype')
+
+            f.write(delim)
+            f.write(f'sched_type')
+
+            f.write(delim)
+            f.write(f'mem_scope')
+
+            f.write(delim)
+            f.write(f'num_vecs_per_tensor')
+
+            f.write(delim)
+            f.write(f'median(µsec)')
+
+            f.write(delim)
+            f.write(f'min(µsec)')
+
+            f.write(delim)
+            f.write(f'max(µsec)')
+
+            f.write(delim)
+            f.write(f'comment')
+
+            f.write('\n')
+
+            for i in range(len(self.dtypes)):
+                f.write('{}'.format(self.dtypes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.sched_types[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.mem_scopes[i]))
+
+                f.write(delim)
+                f.write('{}'.format(self.nums_vecs_per_tensor[i]))
+
+                r = self.benchmark_results[i]
+                ft = self.failure_texts[i]
+
+                if r is None:
+                    f.write(delim)
+                    f.write(delim)
+                    f.write(delim)
+                else:
+                    median_usec = r.median * 1000000
+                    min_usec = r.min * 1000000
+                    max_usec = r.max * 1000000
+
+                    f.write(delim)
+                    f.write(f'{median_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{min_usec:.3}')
+
+                    f.write(delim)
+                    f.write(f'{max_usec:.3}')
+
+                if ft is None:
+                    f.write(delim)
+                    f.write('OK')
+                else:
+                    f.write(delim)
+                    f.write(f'FAILURE: {ft}')
+
+                f.write('\n')
+
+    br = benchmark_results_collection()
+
+    for dtype in ['int8',]:  # Hexagon v68 allows more dtypes, but we're 
sticking with v68 for now.

Review comment:
       If the benchmark collection is moved into a session-scoped fixture, 
these can be pulled out into parameters that are accepted by the test function.
   
   ```python
   dtype = tvm.testing.parameter('int8')
   sched_type = tvm.testing.parameter(1,2)
   ...
   ```

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain

Review comment:
       Can we move this into a separate file for organization?  That would keep 
the benchmarks separate from the functionality tests.

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:

Review comment:
       Can this be pulled out as a global class definition, then exposed as a 
fixture?  That would allow it to be used/aggregated by all tests, rather than 
including them all in a single test function.  I'm picturing a session-scope 
fixture, something like the following.
   
   ```python
   class BenchmarkResults:
       # Same implementation goes here
       ...
   
   @pytest.fixture(scope="session")
   def benchmark_results(benchmark_output_dir):
       results = BenchmarkResults()
       yield results
       
    tabular_output_filename = os.path.join(benchmark_output_dir, 'benchmark-results.csv')
       with open(tabular_output_filename, 'w') as csv_file:
           results.dump(csv_file)
   ```

##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,251 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
 
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port, 
adb_server_socket):
+    """
+    Starting with an elementwise-add computation, try various schedules / 
optimizations to
+    see the impact they have on performance.
+
+    The main motivation for this test is to explore the relationship between 
these
+    schedules / optimizations vs. how effectively the primfunc uses the 
Hexagon's
+    HVX units.
+    """
+
+    host_output_dir = tempfile.mkdtemp()
+
+    print("-"*80)
+    print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+    print("-"*80)
+    print()
+
+    class benchmark_results_collection:
+        def __init__(self):
+            # We'll store the results in corresponding arrays, for simplicity.
+            self.dtypes = []
+            self.sched_types = []
+            self.mem_scopes = []
+            self.nums_vecs_per_tensor = []
+            self.benchmark_results = []
+            self.failure_texts = []
+
+        def record_success(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, benchmark_result):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(benchmark_result)
+            self.failure_texts.append(None)
+
+        def record_failure(self, dtype, sched_type, mem_scope, 
num_vecs_per_tensor, outcome):
+            self.dtypes.append(dtype)
+            self.sched_types.append(sched_type)
+            self.mem_scopes.append(mem_scope)
+            self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+            self.benchmark_results.append(None)
+            self.failure_texts.append(outcome)
+
+
+        def dump(self, f):
+            delim = '\t'
+
+            f.write(f'dtype')
+
+            f.write(delim)
+            f.write(f'sched_type')
+
+            f.write(delim)
+            f.write(f'mem_scope')
+
+            f.write(delim)
+            f.write(f'num_vecs_per_tensor')
+
+            f.write(delim)
+            f.write(f'median(µsec)')
+
+            f.write(delim)
+            f.write(f'min(µsec)')
+
+            f.write(delim)
+            f.write(f'max(µsec)')
+
+            f.write(delim)
+            f.write(f'comment')
+
+            f.write('\n')
+
+            for i in range(len(self.dtypes)):

Review comment:
       Prefer iterating over a collection, rather than a range.
   
   ```python
   for i,(dtype,sched_type,mem_scope,num_vecs_per_tensor) in enumerate(zip(
           self.dtypes,
           self.sched_types,
           self.mem_scopes,
        self.nums_vecs_per_tensor)):
   ```
   
   Alternatively, if the results are stored as a list of tuples, it becomes 
simpler to loop over, and simpler to write to the output.
   
   ```python
   for i,result in enumerate(self.results):
       f.write(delim.join(map(str,result)))
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to