westonpace commented on a change in pull request #11542:
URL: https://github.com/apache/arrow/pull/11542#discussion_r742261776
##########
File path: cpp/src/arrow/util/byte_size.cc
##########
@@ -102,6 +108,282 @@ int64_t TotalBufferSize(const Table& table) {
   return DoTotalBufferSize(table, &seen_buffers);
 }
+namespace {
+
+struct GetByteRangesArray {
+  const ArrayData& input;
+  int64_t offset;
+  int64_t length;
+  UInt64Builder* range_starts;
+  UInt64Builder* range_offsets;
+  UInt64Builder* range_lengths;
+
+  Status VisitBitmap(const std::shared_ptr<Buffer>& buffer) const {
+    if (buffer) {
+      uint64_t data_start = reinterpret_cast<uint64_t>(buffer->data());
+      RETURN_NOT_OK(range_starts->Append(data_start));
+      RETURN_NOT_OK(range_offsets->Append(BitUtil::RoundDown(offset, 8) / 8));
+      RETURN_NOT_OK(range_lengths->Append(BitUtil::CoveringBytes(offset, length)));
+    }
+    return Status::OK();
+  }
+
+  Status VisitFixedWidthArray(const Buffer& buffer, const FixedWidthType& type) const {
+    uint64_t data_start = reinterpret_cast<uint64_t>(buffer.data());
+    uint64_t offset_bits = offset * type.bit_width();
+    uint64_t offset_bytes = BitUtil::RoundDown(static_cast<int64_t>(offset_bits), 8) / 8;
+    uint64_t end_byte =
+        BitUtil::RoundUp(static_cast<int64_t>(offset_bits + (length * type.bit_width())),
+                         8) /
+        8;
+    uint64_t length_bytes = (end_byte - offset_bytes);
+    RETURN_NOT_OK(range_starts->Append(data_start));
+    RETURN_NOT_OK(range_offsets->Append(offset_bytes));
+    return range_lengths->Append(length_bytes);
+  }
+
+  Status Visit(const FixedWidthType& type) const {
+    static_assert(sizeof(uint8_t*) <= sizeof(uint64_t),
+                  "Undefined behavior if pointer larger than uint64_t");
+    RETURN_NOT_OK(VisitBitmap(input.buffers[0]));
+    RETURN_NOT_OK(VisitFixedWidthArray(*input.buffers[1], type));
+    if (input.dictionary) {
+      // This is slightly imprecise because we always assume the entire dictionary is
+      // referenced. If this array has an offset it may only be referencing a portion of
+      // the dictionary.
Review comment:
I see what you are saying, but if we had some trick to easily account for it (without iterating through the data) I'd use it. My goal here is profiling: I'd like to understand how long it takes to move X bytes of data through the system. I think of this measure as roughly the "minimum bytes required to represent the data in the Arrow format". Ideally it should be consistent regardless of file format, row group configuration, etc.
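
For concreteness, here is a standalone sketch (not part of this PR) of the covering-byte arithmetic that `VisitFixedWidthArray` performs, with plain `RoundDown`/`RoundUp` functions standing in for the `BitUtil` helpers. Given a slice's offset and length in values and the type's bit width, it computes the smallest byte range that must move through the system:

```cpp
#include <cstdint>
#include <iostream>

// Round value down/up to the nearest multiple of factor (stand-ins for
// BitUtil::RoundDown / BitUtil::RoundUp; factor assumed positive).
int64_t RoundDown(int64_t value, int64_t factor) { return (value / factor) * factor; }
int64_t RoundUp(int64_t value, int64_t factor) {
  return ((value + factor - 1) / factor) * factor;
}

// Minimum number of bytes covering `length` values starting at value
// `offset`, for a fixed-width type of `bit_width` bits per value.
int64_t CoveringBytes(int64_t offset, int64_t length, int64_t bit_width) {
  int64_t offset_bits = offset * bit_width;
  int64_t offset_bytes = RoundDown(offset_bits, 8) / 8;
  int64_t end_byte = RoundUp(offset_bits + length * bit_width, 8) / 8;
  return end_byte - offset_bytes;
}

int main() {
  // A boolean array sliced to offset=3, length=10 touches bits [3, 13),
  // which round out to bytes [0, 2): 2 bytes, even though 10 bits fit in
  // less than 2 bytes when byte-aligned.
  std::cout << CoveringBytes(3, 10, 1) << "\n";   // 2
  // An int32 array sliced to offset=5, length=100 touches bytes [20, 420).
  std::cout << CoveringBytes(5, 100, 32) << "\n"; // 400
  return 0;
}
```

Note this is the per-buffer measure only; as the inline comment above says, a sliced dictionary array would still count its entire dictionary.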
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]