cconvey commented on a change in pull request #10604:
URL: https://github.com/apache/tvm/pull/10604#discussion_r836671069
##########
File path: tests/python/contrib/test_hexagon/test_launcher.py
##########
@@ -40,6 +42,256 @@
# triggering TIME_WAIT state on the server socket. This prevents another
# server from binding to the same port until the wait time elapses.
+@requires_hexagon_toolchain
+def test_add_hvx(android_serial_number, tvm_tracker_host, tvm_tracker_port,
adb_server_socket):
+ """
+ Starting with an elementwise-add computation, try various schedules /
optimizations to
+ see the impact they have on performance.
+
+ The main motivation for this test is to explore the relationship between
these
+ schedules / optimizations vs. how effectively the primfunc uses the
Hexagon's
+ HVX units.
+ """
+
+ host_output_dir = tempfile.mkdtemp()
+
+ print("-"*80)
+ print("OUTPUT DIRECTORY: {}".format(host_output_dir))
+ print("-"*80)
+ print()
+
+ class benchmark_results_collection:
+ def __init__(self):
+ # We'll store the results in corresponding arrays, for simplicity.
+ self.dtypes = []
+ self.sched_types = []
+ self.mem_scopes = []
+ self.nums_vecs_per_tensor = []
+ self.benchmark_results = []
+ self.failure_texts = []
+
+ def record_success(self, dtype, sched_type, mem_scope,
num_vecs_per_tensor, benchmark_result):
+ self.dtypes.append(dtype)
+ self.sched_types.append(sched_type)
+ self.mem_scopes.append(mem_scope)
+ self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+ self.benchmark_results.append(benchmark_result)
+ self.failure_texts.append(None)
+
+ def record_failure(self, dtype, sched_type, mem_scope,
num_vecs_per_tensor, outcome):
+ self.dtypes.append(dtype)
+ self.sched_types.append(sched_type)
+ self.mem_scopes.append(mem_scope)
+ self.nums_vecs_per_tensor.append(num_vecs_per_tensor)
+ self.benchmark_results.append(None)
+ self.failure_texts.append(outcome)
+
+
+ def dump(self, f):
+ delim = '\t'
+
+ f.write(f'dtype')
+
+ f.write(delim)
+ f.write(f'sched_type')
+
+ f.write(delim)
+ f.write(f'mem_scope')
+
+ f.write(delim)
+ f.write(f'num_vecs_per_tensor')
+
+ f.write(delim)
+ f.write(f'median(µsec)')
+
+ f.write(delim)
+ f.write(f'min(µsec)')
+
+ f.write(delim)
+ f.write(f'max(µsec)')
+
+ f.write(delim)
+ f.write(f'comment')
+
+ f.write('\n')
+
+ for i in range(len(self.dtypes)):
+ f.write('{}'.format(self.dtypes[i]))
+
+ f.write(delim)
+ f.write('{}'.format(self.sched_types[i]))
+
+ f.write(delim)
+ f.write('{}'.format(self.mem_scopes[i]))
+
+ f.write(delim)
+ f.write('{}'.format(self.nums_vecs_per_tensor[i]))
+
+ r = self.benchmark_results[i]
+ ft = self.failure_texts[i]
+
+ if r is None:
+ f.write(delim)
+ f.write(delim)
+ f.write(delim)
+ else:
+ median_usec = r.median * 1000000
+ min_usec = r.min * 1000000
+ max_usec = r.max * 1000000
+
+ f.write(delim)
+ f.write(f'{median_usec:.3}')
+
+ f.write(delim)
+ f.write(f'{min_usec:.3}')
+
+ f.write(delim)
+ f.write(f'{max_usec:.3}')
+
+ if ft is None:
+ f.write(delim)
+ f.write('OK')
+ else:
+ f.write(delim)
+ f.write(f'FAILURE: {ft}')
+
+ f.write('\n')
+
+ br = benchmark_results_collection()
+
+ # Hexagon v69 allows more dtypes, but we're sticking with v68 for now.
+ for dtype in ['int8',]:
+
+ # These numbers are only meaningful in the context of this script.
+ for sched_type in [1,2,]:
+
+ for mem_scope in ["global", "global.vtcm"]:
+
+ # These numbers are fairly arbitrary, but they're meant to
stress memory/caches to
+ # various extents.
+ for num_vectors_per_tensor in [1,16,64,512,2048]:
+
+ version_name =
'dtype:{}-schedtype:{}-memscope{}-numvecs:{}'.format(dtype, str(sched_type),
str(mem_scope), num_vectors_per_tensor)
+ print("CONFIGURATION: {}".format(version_name))
+
+ # This is a fixed detail of the v68 architecture.
+ HVX_VECTOR_BYTES=128
+
+ dtype_bits = tvm._ffi.runtime_ctypes.DataType(dtype).bits
+ assert dtype_bits % 8 == 0
+ dtype_bytes = dtype_bits // 8
+
+ elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes
+
+ # Note! We're providing the complete input tensor shapes
now,
+ # whereas the original code only reveals the exact shape
when
+ # about to call the kernel.
+
+ shape = [num_vectors_per_tensor, elem_per_hvx_vector,]
+
+ A = tvm.te.placeholder(shape, dtype=dtype)
+ B = tvm.te.placeholder(shape, dtype=dtype)
+ C = tvm.te.compute(A.shape, lambda i,j: A[i,j] + B[i,j],
name="C")
+
+ sched = tvm.te.create_schedule(C.op)
+
+ if sched_type == 1:
+ pass
+ elif sched_type == 2:
+ sched[C].vectorize(C.op.axis[1])
+ else:
+ raise Exception("Unknown schedule type")
+
+ # This module is only created so humans can inspect its IR.
+ module_for_ir_dump = tvm.lower(sched, [A,B,C], "foo")
+
+ report_path = os.path.join(host_output_dir,
f'{version_name}.txt')
+
+ with open(report_path, 'w') as f:
+ f.write("LOWERED IR MODULE:\n")
+ f.write(str(module_for_ir_dump))
+ f.write('\n')
+
+ target_hexagon = tvm.target.hexagon("v68",
link_params=True)
+ func = tvm.build(
+ sched, [A, B, C],
tvm.target.Target(target_hexagon, host=target_hexagon), name="add_hvx"
+ )
+
+ host_dso_binary_path = os.path.join(host_output_dir,
f'test_binary-{version_name}.so')
+ target_dso_binary_filename = 'test_binary.so'
+
+ func.save(str(host_dso_binary_path))
+ print("SAVED BINARY TO HOST PATH:
{}".format(str(host_dso_binary_path)))
+
+ if not android_serial_number:
+ pytest.skip(msg="Skip hardware test since
ANDROID_SERIAL_NUMBER is not set.")
+
+ rpc_info = {
+ "rpc_tracker_host": tvm_tracker_host,
+ "rpc_tracker_port": tvm_tracker_port,
+ "rpc_server_port": RPC_SERVER_PORT + 0, # See
note at the beginning of the file
+ "adb_server_socket": adb_server_socket,
+ }
+ launcher =
HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
+ launcher.upload(host_dso_binary_path,
target_dso_binary_filename)
+ launcher.start_server()
+
+ try:
+ with launcher.start_session() as sess:
+ mod =
launcher.load_module(target_dso_binary_filename, sess)
+
+ host_numpy_A_data = np.ndarray(shape,
dtype=dtype)
+ host_numpy_B_data = np.ndarray(shape,
dtype=dtype)
+ host_numpy_C_data = np.ndarray(shape,
dtype=dtype)
+ host_numpy_C_data_expected = np.ndarray(shape,
dtype=dtype)
+
+ def intended_val_A(i,j):
+ return i + j
+
+ def intended_val_B(i,j):
+ return (i+1) * (j+1)
+
+ for i in range(shape[0]):
+ for j in range(shape[1]):
+ host_numpy_A_data[i,j] =
intended_val_A(i,j)
+ host_numpy_B_data[i,j] =
intended_val_B(i,j)
+ host_numpy_C_data_expected[i,j] =
intended_val_A(i,j) + intended_val_B(i,j)
+
+ A_data = tvm.nd.empty(shape, dtype,
sess.device, mem_scope)
+ A_data.copyfrom(host_numpy_A_data)
+
+ B_data = tvm.nd.empty(shape, dtype,
sess.device, mem_scope)
+ B_data.copyfrom(host_numpy_B_data)
+
+ C_data = tvm.nd.empty(shape, dtype,
sess.device, mem_scope)
+
+ timer = mod.time_evaluator("add_hvx",
sess.device, number=100, repeat=1, min_repeat_ms=1000)
Review comment:
TODO: tweak the numbers here, especially when using HexagonSimulator, where
an excessive number of runs is painfully slow.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]