westonpace commented on a change in pull request #11542:
URL: https://github.com/apache/arrow/pull/11542#discussion_r742261776
##########
File path: cpp/src/arrow/util/byte_size.cc
##########
@@ -102,6 +108,282 @@ int64_t TotalBufferSize(const Table& table) {
   return DoTotalBufferSize(table, &seen_buffers);
 }
+namespace {
+
+struct GetByteRangesArray {
+  const ArrayData& input;
+  int64_t offset;
+  int64_t length;
+  UInt64Builder* range_starts;
+  UInt64Builder* range_offsets;
+  UInt64Builder* range_lengths;
+
+  Status VisitBitmap(const std::shared_ptr<Buffer>& buffer) const {
+    if (buffer) {
+      uint64_t data_start = reinterpret_cast<uint64_t>(buffer->data());
+      RETURN_NOT_OK(range_starts->Append(data_start));
+      RETURN_NOT_OK(range_offsets->Append(BitUtil::RoundDown(offset, 8) / 8));
+      RETURN_NOT_OK(range_lengths->Append(BitUtil::CoveringBytes(offset, length)));
+    }
+    return Status::OK();
+  }
+
+  Status VisitFixedWidthArray(const Buffer& buffer, const FixedWidthType& type) const {
+    uint64_t data_start = reinterpret_cast<uint64_t>(buffer.data());
+    uint64_t offset_bits = offset * type.bit_width();
+    uint64_t offset_bytes = BitUtil::RoundDown(static_cast<int64_t>(offset_bits), 8) / 8;
+    uint64_t end_byte =
+        BitUtil::RoundUp(static_cast<int64_t>(offset_bits + (length * type.bit_width())),
+                         8) /
+        8;
+    uint64_t length_bytes = (end_byte - offset_bytes);
+    RETURN_NOT_OK(range_starts->Append(data_start));
+    RETURN_NOT_OK(range_offsets->Append(offset_bytes));
+    return range_lengths->Append(length_bytes);
+  }
+
+  Status Visit(const FixedWidthType& type) const {
+    static_assert(sizeof(uint8_t*) <= sizeof(uint64_t),
+                  "Undefined behavior if pointer larger than uint64_t");
+    RETURN_NOT_OK(VisitBitmap(input.buffers[0]));
+    RETURN_NOT_OK(VisitFixedWidthArray(*input.buffers[1], type));
+    if (input.dictionary) {
+      // This is slightly imprecise because we always assume the entire dictionary is
+      // referenced. If this array has an offset it may only be referencing a portion of
+      // the dictionary.
Review comment:
I see what you are saying, but if we had some trick to easily account for it (without iterating through the data) I'd use it. My goal here is profiling: I'd like to understand how long it takes to move X bytes of data through the system. I think of this measure as roughly the "minimum bytes required to represent the data in the Arrow format". Ideally it should be consistent regardless of file format, row group configuration, etc.
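
For concreteness, here is a standalone sketch (not part of this PR) of the covering-byte arithmetic that `VisitFixedWidthArray` performs, with plain `RoundDown`/`RoundUp` functions standing in for the `BitUtil` helpers. Given a slice's offset and length in values and the type's bit width, it computes the smallest byte range that must move through the system:

```cpp
#include <cstdint>
#include <iostream>

// Round value down/up to the nearest multiple of factor (stand-ins for
// BitUtil::RoundDown / BitUtil::RoundUp; factor assumed positive).
int64_t RoundDown(int64_t value, int64_t factor) { return (value / factor) * factor; }
int64_t RoundUp(int64_t value, int64_t factor) {
  return ((value + factor - 1) / factor) * factor;
}

// Minimum number of bytes covering `length` values starting at value
// `offset`, for a fixed-width type of `bit_width` bits per value.
int64_t CoveringBytes(int64_t offset, int64_t length, int64_t bit_width) {
  int64_t offset_bits = offset * bit_width;
  int64_t offset_bytes = RoundDown(offset_bits, 8) / 8;
  int64_t end_byte = RoundUp(offset_bits + length * bit_width, 8) / 8;
  return end_byte - offset_bytes;
}

int main() {
  // A boolean array sliced to offset=3, length=10 touches bits [3, 13),
  // which round out to bytes [0, 2): 2 bytes, even though 10 bits fit in
  // less than 2 bytes when byte-aligned.
  std::cout << CoveringBytes(3, 10, 1) << "\n";   // 2
  // An int32 array sliced to offset=5, length=100 touches bytes [20, 420).
  std::cout << CoveringBytes(5, 100, 32) << "\n"; // 400
  return 0;
}
```

Note this is the per-buffer measure only; as the inline comment above says, a sliced dictionary array would still count its entire dictionary.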
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]