pitrou commented on code in PR #35345: URL: https://github.com/apache/arrow/pull/35345#discussion_r1400882366
########## cpp/src/arrow/array/concatenate.cc: ########## @@ -160,16 +168,144 @@ Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offse // Write offsets into dst, ensuring that the first offset written is // first_offset - auto adjustment = first_offset - src_begin[0]; + auto displacement = first_offset - src_begin[0]; // NOTE: Concatenate can be called during IPC reads to append delta dictionaries. // Avoid UB on non-validated input by doing the addition in the unsigned domain. // (the result can later be validated using Array::ValidateFull) - std::transform(src_begin, src_end, dst, [adjustment](Offset offset) { - return SafeSignedAdd(offset, adjustment); + std::transform(src_begin, src_end, dst, [displacement](Offset offset) { + return SafeSignedAdd(offset, displacement); }); return Status::OK(); } +template <typename offset_type> +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst); + +// Concatenate buffers holding list-view offsets into a single buffer of offsets +// +// value_ranges contains the relevant ranges of values in the child array actually +// referenced to by the views. Most commonly, these ranges will start from 0, +// but when that is not the case, we need to adjust the displacement of offsets. +// The concatenated child array does not contain values from the beginning +// if they are not referenced to by any view. +// +// The child arrays and the sizes buffer are used to ensure we can trust the offsets in +// offset_buffers to be within the valid range. +// +// This function also mutates sizes so that null list-view entries have size 0. +// +// \param[in] in The child arrays +// \param[in,out] sizes The concatenated sizes buffer +template <typename offset_type> +Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* sizes, + const BufferVector& offset_buffers, + const std::vector<Range>& value_ranges, + MemoryPool* pool, std::shared_ptr<Buffer>* out) { + DCHECK_EQ(offset_buffers.size(), value_ranges.size()); + + // Allocate resulting offsets buffer and initialize it with zeros + const int64_t out_size_in_bytes = SumBufferSizesInBytes(offset_buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(out_size_in_bytes, pool)); + memset((*out)->mutable_data(), 0, static_cast<size_t>((*out)->size())); + + auto* out_offsets = (*out)->mutable_data_as<offset_type>(); + + int64_t num_child_values = 0; + int64_t elements_length = 0; + for (size_t i = 0; i < offset_buffers.size(); ++i) { + const auto displacement = + static_cast<offset_type>(num_child_values - value_ranges[i].offset); + RETURN_NOT_OK(PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length, + /*src=*/*offset_buffers[i], displacement, + /*dst=*/out_offsets + elements_length)); + elements_length += offset_buffers[i]->size() / sizeof(offset_type); + num_child_values += value_ranges[i].length; + if (num_child_values > std::numeric_limits<offset_type>::max()) { + return Status::Invalid("offset overflow while concatenating arrays"); + } + } + DCHECK_EQ(elements_length, + static_cast<int64_t>(out_size_in_bytes / sizeof(offset_type))); + + return Status::OK(); +} + +template <typename offset_type> +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst) { + if (src.size() == 0) { + return Status::OK(); + } + const auto& validity_buffer = input.buffers[0]; + if (validity_buffer) { + // Ensure that it is safe to access all the bits in the validity bitmap of input. + RETURN_NOT_OK(internal::CheckSliceParams(/*size=*/8 * validity_buffer->size(), + input.offset, input.length, "buffer")); + } + + const auto offsets = src.data_as<offset_type>(); + DCHECK_EQ(static_cast<int64_t>(src.size() / sizeof(offset_type)), input.length); + + auto visit_not_null = [&](int64_t position) { + if (sizes[position] > 0) { + // NOTE: Concatenate can be called during IPC reads to append delta + // dictionaries. Avoid UB on non-validated input by doing the addition in the + // unsigned domain. (the result can later be validated using + // Array::ValidateFull) + const auto displaced_offset = SafeSignedAdd(offsets[position], displacement); + // displaced_offset>=0 is guaranteed by RangeOfValuesUsed returning the + // smallest offset of valid and non-empty list-views. + DCHECK_GE(displaced_offset, 0); + dst[position] = displaced_offset; + } else { + // Do nothing to leave the dst[position] as 0. + } + }; + + const auto* validity = validity_buffer ? validity_buffer->data_as<uint8_t>() : nullptr; + internal::OptionalBitBlockCounter bit_counter(validity, input.offset, input.length); + int64_t position = 0; + while (position < input.length) { + internal::BitBlockCount block = bit_counter.NextBlock(); + if (block.AllSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (sizes[position] > 0) { + // NOTE: Concatenate can be called during IPC reads to append delta + // dictionaries. Avoid UB on non-validated input by doing the addition in the + // unsigned domain. (the result can later be validated using + // Array::ValidateFull) + const auto displaced_offset = SafeSignedAdd(offsets[position], displacement); + // displaced_offset>=0 is guaranteed by RangeOfValuesUsed returning the + // smallest offset of valid and non-empty list-views. + DCHECK_GE(displaced_offset, 0); + dst[position] = displaced_offset; + } else { + // Do nothing to leave dst[position] as 0. + } Review Comment: I might be misreading, but is it just the same as `visit_not_null(i)`? ########## cpp/src/arrow/util/list_util.h: ########## @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cstdint> +#include <utility> + +#include "arrow/array/data.h" +#include "arrow/result.h" + +namespace arrow { +namespace list_util { +namespace internal { + +/// \brief Calculate the smallest continuous range of values used by the +/// var-length list-like input (list, map and list-view types). +/// +/// \param input The input array such that is_var_length_list_like(input.type) +/// is true +/// \return A pair of (offset, length) describing the range +ARROW_EXPORT Result<std::pair<int64_t, int64_t>> RangeOfValuesUsed( + const ArraySpan& input); + +/// \brief Calculate the sum of the sizes of all valid lists or list-views +/// +/// This is usally the same as the length of the RangeOfValuesUsed() range, but Review Comment: ```suggestion /// This is usually the same as the length of the RangeOfValuesUsed() range, but ``` ########## cpp/src/arrow/array/concatenate_test.cc: ########## @@ -69,33 +99,117 @@ class ConcatenateTest : public ::testing::Test { return slices; } + std::shared_ptr<Buffer> ValidityBitmap(int64_t size, double null_probability) { + return rag_.NullBitmap(size, null_probability, kDefaultBufferAlignment, + default_memory_pool()); + } + template <typename PrimitiveType> - std::shared_ptr<Array> GeneratePrimitive(int64_t size, double null_probability) { + std::shared_ptr<Array> PrimitiveArray(int64_t size, double null_probability) { if (std::is_same<PrimitiveType, BooleanType>::value) { - return rng_.Boolean(size, 0.5, null_probability); + return rag_.Boolean(size, 0.5, null_probability); } - return rng_.Numeric<PrimitiveType, uint8_t>(size, 0, 127, null_probability); + return rag_.Numeric<PrimitiveType, uint8_t>(size, 0, 127, null_probability); + } + + std::shared_ptr<Array> StringArray(int64_t size, double null_probability) { + return rag_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); + } + + std::shared_ptr<Array> LargeStringArray(int64_t size, double null_probability) { + return rag_.LargeString(size, /*min_length =*/0, /*max_length =*/15, + null_probability); + } + + std::shared_ptr<Array> StringViewArray(int64_t size, double null_probability) { + return rag_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, + /*max_buffer_length=*/200); + } + + std::shared_ptr<Array> ArrayOf(std::shared_ptr<DataType> type, int64_t size, + double null_probability) { + return rag_.ArrayOf(std::move(type), size, null_probability); + } + + // TODO(GH-38656): Use the random array generators from testing/random.h here + + template <typename ListType, + typename ListArrayType = typename TypeTraits<ListType>::ArrayType> + Result<std::shared_ptr<ListArrayType>> ListArray(int32_t length, + double null_probability) { + using offset_type = typename ListType::offset_type; + using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType; + + auto values_size = length * 4; + auto values = PrimitiveArray<Int8Type>(values_size, null_probability); + auto offsets_vector = Offsets<offset_type>(values_size, length); + // Ensure first and last offsets encompass the whole values array + offsets_vector.front() = 0; Review Comment: Thanks! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org