felipecrv commented on code in PR #35345: URL: https://github.com/apache/arrow/pull/35345#discussion_r1402168380
########## cpp/src/arrow/array/concatenate.cc: ########## @@ -160,16 +168,144 @@ Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offse // Write offsets into dst, ensuring that the first offset written is // first_offset - auto adjustment = first_offset - src_begin[0]; + auto displacement = first_offset - src_begin[0]; // NOTE: Concatenate can be called during IPC reads to append delta dictionaries. // Avoid UB on non-validated input by doing the addition in the unsigned domain. // (the result can later be validated using Array::ValidateFull) - std::transform(src_begin, src_end, dst, [adjustment](Offset offset) { - return SafeSignedAdd(offset, adjustment); + std::transform(src_begin, src_end, dst, [displacement](Offset offset) { + return SafeSignedAdd(offset, displacement); }); return Status::OK(); } +template <typename offset_type> +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst); + +// Concatenate buffers holding list-view offsets into a single buffer of offsets +// +// value_ranges contains the relevant ranges of values in the child array actually +// referenced to by the views. Most commonly, these ranges will start from 0, +// but when that is not the case, we need to adjust the displacement of offsets. +// The concatenated child array does not contain values from the beginning +// if they are not referenced to by any view. +// +// The child arrays and the sizes buffer are used to ensure we can trust the offsets in +// offset_buffers to be within the valid range. +// +// This function also mutates sizes so that null list-view entries have size 0. 
+// +// \param[in] in The child arrays +// \param[in,out] sizes The concatenated sizes buffer +template <typename offset_type> +Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* sizes, + const BufferVector& offset_buffers, + const std::vector<Range>& value_ranges, + MemoryPool* pool, std::shared_ptr<Buffer>* out) { + DCHECK_EQ(offset_buffers.size(), value_ranges.size()); + + // Allocate resulting offsets buffer and initialize it with zeros + const int64_t out_size_in_bytes = SumBufferSizesInBytes(offset_buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(out_size_in_bytes, pool)); + memset((*out)->mutable_data(), 0, static_cast<size_t>((*out)->size())); + + auto* out_offsets = (*out)->mutable_data_as<offset_type>(); + + int64_t num_child_values = 0; + int64_t elements_length = 0; + for (size_t i = 0; i < offset_buffers.size(); ++i) { + const auto displacement = + static_cast<offset_type>(num_child_values - value_ranges[i].offset); + RETURN_NOT_OK(PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length, + /*src=*/*offset_buffers[i], displacement, + /*dst=*/out_offsets + elements_length)); + elements_length += offset_buffers[i]->size() / sizeof(offset_type); + num_child_values += value_ranges[i].length; + if (num_child_values > std::numeric_limits<offset_type>::max()) { + return Status::Invalid("offset overflow while concatenating arrays"); + } + } + DCHECK_EQ(elements_length, + static_cast<int64_t>(out_size_in_bytes / sizeof(offset_type))); + + return Status::OK(); +} + +template <typename offset_type> +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst) { + if (src.size() == 0) { + return Status::OK(); + } + const auto& validity_buffer = input.buffers[0]; + if (validity_buffer) { + // Ensure that it is safe to access all the bits in the validity bitmap of input. 
+ RETURN_NOT_OK(internal::CheckSliceParams(/*size=*/8 * validity_buffer->size(), + input.offset, input.length, "buffer")); + } + + const auto offsets = src.data_as<offset_type>(); + DCHECK_EQ(static_cast<int64_t>(src.size() / sizeof(offset_type)), input.length); + + auto visit_not_null = [&](int64_t position) { + if (sizes[position] > 0) { + // NOTE: Concatenate can be called during IPC reads to append delta + // dictionaries. Avoid UB on non-validated input by doing the addition in the + // unsigned domain. (the result can later be validated using + // Array::ValidateFull) + const auto displaced_offset = SafeSignedAdd(offsets[position], displacement); + // displaced_offset>=0 is guaranteed by RangeOfValuesUsed returning the + // smallest offset of valid and non-empty list-views. + DCHECK_GE(displaced_offset, 0); + dst[position] = displaced_offset; + } else { + // Do nothing to leave the dst[position] as 0. + } + }; + + const auto* validity = validity_buffer ? validity_buffer->data_as<uint8_t>() : nullptr; + internal::OptionalBitBlockCounter bit_counter(validity, input.offset, input.length); + int64_t position = 0; + while (position < input.length) { + internal::BitBlockCount block = bit_counter.NextBlock(); + if (block.AllSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (sizes[position] > 0) { + // NOTE: Concatenate can be called during IPC reads to append delta + // dictionaries. Avoid UB on non-validated input by doing the addition in the + // unsigned domain. (the result can later be validated using + // Array::ValidateFull) + const auto displaced_offset = SafeSignedAdd(offsets[position], displacement); + // displaced_offset>=0 is guaranteed by RangeOfValuesUsed returning the + // smallest offset of valid and non-empty list-views. + DCHECK_GE(displaced_offset, 0); + dst[position] = displaced_offset; + } else { + // Do nothing to leave dst[position] as 0. + } Review Comment: You're right. 
I extracted the function from the code below when I noticed the duplication, but forgot to replace the duplicated code above with a call to it. Pushing a fix soon. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org