csullivan commented on code in PR #10878:
URL: https://github.com/apache/tvm/pull/10878#discussion_r842912665
##########
src/runtime/hexagon/hexagon/hexagon_buffer.cc:
##########
@@ -205,73 +205,116 @@ void HexagonBuffer::SetStorageScope(Optional<String>
scope) {
}
}
-void HexagonBuffer::CopyTo(void* data, size_t nbytes) const {
- CHECK_LE(nbytes, TotalBytes());
- CHECK(managed_allocations_.size() && "CopyTo not supported on unmanaged
`external` allocations");
+struct BufferSet {
+ BufferSet(void* const* buffers, size_t num_regions, size_t region_size_bytes)
+ : buffers(buffers), num_regions(num_regions),
region_size_bytes(region_size_bytes) {}
+
+ size_t TotalBytes() const { return num_regions * region_size_bytes; }
+
+ void* const* buffers;
+ size_t num_regions;
+ size_t region_size_bytes;
+};
+
+struct MemoryCopy {
+ MemoryCopy(void* dest, void* src, size_t num_bytes)
+ : dest(dest), src(src), num_bytes(num_bytes) {}
- size_t copied = 0;
- for (const auto& managed_alloc : managed_allocations_) {
- size_t bytes_to_copy = std::min(nbytes - copied,
managed_alloc->allocation_nbytes_);
- if (bytes_to_copy == 0) break;
+ // Returns true when `other` begins exactly where this copy ends, in
+ // both the source and the destination, so that the two copies can be
+ // merged into one contiguous transfer of num_bytes + other.num_bytes.
+ bool IsDirectlyBefore(const MemoryCopy& other) {
+ // One-past-the-end addresses of this copy. Comparing the start
+ // addresses instead would never match a following copy, and the
+ // merge step in the caller would silently do nothing.
+ void* src_end = static_cast<unsigned char*>(src) + num_bytes;
+ void* dest_end = static_cast<unsigned char*>(dest) + num_bytes;
+ return (src_end == other.src) && (dest_end == other.dest);
+ }
- void* data_plus_copied = static_cast<void*>((static_cast<char*>(data) +
copied));
- int status = hexagon_user_dma_1d_sync(data_plus_copied,
managed_alloc->data_, bytes_to_copy);
- CHECK_EQ(status, 0);
+ void* dest;
+ void* src;
+ size_t num_bytes;
+};
+
+void hexagon_buffer_copy_across_regions(const BufferSet& dest, const
BufferSet& src,
+ size_t bytes_to_copy) {
+ CHECK_LE(bytes_to_copy, src.TotalBytes());
+ CHECK_LE(bytes_to_copy, dest.TotalBytes());
+
+ auto pointer_to = [](const BufferSet& buf, size_t region_i, size_t byte_i)
-> void* {
+ void* region = buf.buffers[region_i];
+ return static_cast<unsigned char*>(region) + byte_i;
+ };
+
+ size_t num_src_regions = (bytes_to_copy + src.region_size_bytes - 1) /
src.region_size_bytes;
+
+ // First, determine all copies that do not cross boundaries in
+ // either source or destination region. This requires two loops, as
+ // a single source region may overlap one or more destination
+ // regions, and vice versa.
+ std::vector<MemoryCopy> micro_copies;
+ for (size_t src_i = 0; src_i < num_src_regions; src_i++) {
+ size_t src_region_begin = src_i * src.region_size_bytes;
+ size_t src_region_end = std::min((src_i + 1) * src.region_size_bytes,
bytes_to_copy);
+
+ size_t dest_i_begin = src_region_begin / dest.region_size_bytes;
+ size_t dest_i_end = (src_region_end - 1) / dest.region_size_bytes + 1;
+ for (size_t dest_i = dest_i_begin; dest_i < dest_i_end; dest_i++) {
+ size_t offset_begin = std::max(src_region_begin, dest_i *
dest.region_size_bytes);
+ size_t offset_end = std::min(src_region_end, (dest_i + 1) *
dest.region_size_bytes);
+
+ size_t num_bytes = offset_end - offset_begin;
+ void* src_ptr = pointer_to(src, src_i, offset_begin %
src.region_size_bytes);
+ void* dest_ptr = pointer_to(dest, dest_i, offset_begin %
dest.region_size_bytes);
+ micro_copies.push_back(MemoryCopy(dest_ptr, src_ptr, num_bytes));
+ }
+ }
+
+ // If regions are contiguously allocated, we can reduce the number
+ // of copies required by merging adjacent copies.
+ std::sort(micro_copies.begin(), micro_copies.end(),
+ [](const MemoryCopy& a, const MemoryCopy& b) { return a.src <
b.src; });
+
+ std::vector<MemoryCopy> macro_copies;
+ for (const auto& copy : micro_copies) {
+ if (macro_copies.size() && macro_copies.back().IsDirectlyBefore(copy)) {
+ macro_copies.back().num_bytes += copy.num_bytes;
+ } else {
+ macro_copies.push_back(copy);
+ }
+ }
- copied += bytes_to_copy;
+ // Finally, do the memory copies.
+ for (const auto& copy : macro_copies) {
+ int error_code = hexagon_user_dma_1d_sync(copy.dest, copy.src,
copy.num_bytes);
+ CHECK_EQ(error_code, 0);
}
}
+void HexagonBuffer::CopyTo(void* data, size_t nbytes) const {
+ CHECK(managed_allocations_.size() && "CopyTo not supported on unmanaged
`external` allocations");
+
+ BufferSet src(allocations_.data(), allocations_.size(),
nbytes_per_allocation_);
+ BufferSet dest(&data, 1, nbytes);
Review Comment:
Are we intentionally choosing, for now, not to handle the case in which the
CopyTo destination or the CopyFrom source has more than one memory region?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]