felipecrv commented on code in PR #41297: URL: https://github.com/apache/arrow/pull/41297#discussion_r1588285198
########## cpp/src/arrow/util/fixed_width_internal.h: ########## @@ -0,0 +1,364 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <cassert> +#include <cstdint> + +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +namespace arrow::compute { +// XXX: remove dependency on compute::KernelContext +class KernelContext; +} // namespace arrow::compute + +namespace arrow::util { + +/// \brief Checks if the given array has a fixed-width type or if it's an array of +/// fixed-size list that can be flattened to an array of fixed-width values. +/// +/// Fixed-width types are the ones defined by the is_fixed_width() predicate in +/// type_traits.h. They are all the types that passes any of the following +/// predicates: +/// +/// - is_primitive() +/// - is_fixed_size_binary() +/// - is_dictionary() +/// +/// At least 3 types in this set require special care: +/// - `Type::BOOL` is fixed-width, but it's a 1-bit type and pointers to first bit +/// in boolean buffers are not always aligned to byte boundaries. +/// - `Type::DICTIONARY` is fixed-width because the indices are fixed-width, but the +/// dictionary values are not necessarily fixed-width and have to be managed +/// by separate operations. +/// - Type::FIXED_SIZE_BINARY unlike other fixed-width types, fixed-size binary +/// values are defined by a size attribute that is not known at compile time. +/// The other types have power-of-2 byte widths, while fixed-size binary can +/// have any byte width including 0. +/// +/// Additionally, we say that a type is "fixed-width like" if it's a fixed-width as +/// defined above, or if it's a fixed-size list (or nested fixed-size lists) and +/// the innermost type is fixed-width and the following restrictions also apply: +/// - The value type of the innermost fixed-size list is not BOOL (it has to be excluded +/// because a 1-bit type doesn't byte-align) +/// - Only the top-level array may have nulls, all the inner array have to be completely +/// free of nulls so we don't need to manage internal validity bitmaps. +/// +/// Take the following `fixed_size_list<fixed_size_list<int32, 2>, 3>` array as an +/// example: +/// +/// [ +/// [[1, 2], [3, 4], [ 5, 6]], +/// null, +/// [[7, 8], [9, 10], [11, 12]] +/// ] +/// +/// in memory, it would look like: +/// +/// { +/// type: fixed_size_list<fixed_size_list<int32, 2>, 3>, +/// length: 3, +/// null_count: 1, +/// offset: 0, +/// buffers: [ +/// 0: [0b00000101] +/// ], +/// child_data: [ +/// 0: { +/// type: fixed_size_list<int32, 2>, +/// length: 9, +/// null_count: 0, +/// offset: 0, +/// buffers: [0: NULL], +/// child_data: [ +/// 0: { +/// type: int32, +/// length: 18, +/// null_count: 0, +/// offset: 0, +/// buffers: [ +/// 0: NULL, +/// 1: [ 1, 2, 3, 4, 5, 6, +/// 0, 0, 0, 0, 0, 0 +/// 7, 8, 9, 10, 11, 12 ] +/// ], +/// child_data: [] +/// } +/// ] +/// } +/// ] +/// } +/// +/// This layout fits the fixed-width like definition because the innermost type +/// is byte-aligned fixed-width (int32 = 4 bytes) and the internal arrays don't +/// have nulls. The validity bitmap is only needed at the top-level array. +/// +/// Writing to this array can be done in the same way writing to a flat fixed-width +/// array is done, by: +/// 1. Updating the validity bitmap at the top-level array if nulls are present. +/// 2. Updating a continuous fixed-width block of memory through a single pointer. +/// +/// The length of this block of memory is the product of the list sizes in the +/// `FixedSizeList` types and the byte width of the innermost fixed-width type: +/// +/// 3 * 2 * 4 = 24 bytes +/// +/// Writing the `[[1, 2], [3, 4], [5, 6]]` value at a given index can be done by +/// simply setting the validity bit to 1 and writing the 24-byte sequence of +/// integers `[1, 2, 3, 4, 5, 6]` to the memory block at `byte_ptr + index * 24`. +/// +/// The length of the top-level array fully defines the lengths that all the nested +/// arrays must have, which makes defining all the lengths as easy as defining the +/// length of the top-level array. +/// +/// length = 3 +/// child_data[0].length == 3 * 3 == 9 +/// child_data[0].child_data[0].length == 3 * 3 * 2 == 18 +/// +/// child_data[0].child_data[0].buffers[1].size() >= +/// (3 * (3 * 2 * sizeof(int32)) == 3 * 24 == 72) +/// +/// Dealing with offsets is a bit involved. Let's say the array described above has +/// the offsets 2, 5, and 7: +/// +/// { +/// type: fixed_size_list<fixed_size_list<int32, 2>, 3>, +/// offset: 2, +/// ... +/// child_data: [ +/// 0: { +/// type: fixed_size_list<int32, 2>, +/// offset: 5, +/// ... +/// child_data: [ +/// 0: { +/// type: int32, +/// offset: 7, +/// buffers: [ +/// 0: NULL, +/// 1: [ 1, 1, 1, 1, 1, 1, 1, // 7 values skipped +/// 0,1, 0,1, 0,1, 0,1, 0,1, // 5 [x,x] values skipped +/// +/// 0,0,0,0,0,1, // +/// 0,0,0,0,0,1, // 2 [[x,x], [x,x], [x,x]] values skipped +/// +/// 1, 2, 3, 4, 5, 6, // +/// 0, 0, 0, 0, 0, 0 // the actual values +/// 7, 8, 9, 10, 11, 12 // +/// ] +/// ], +/// } +/// ] +/// } +/// ] +/// } +/// +/// The offset of the innermost values buffer, in bytes, is calculated as: +/// +/// ((2 * 3) + (5 * 2) + 7) * sizeof(int32) = 29 * 4 bytes = 116 bytes +/// +/// In general, the formula to calculate the offset of the innermost values buffer is: +/// +/// ((off_0 * fsl_size_0) + (off_1 * fsl_size_1) + ... + innermost_off) +/// * sizeof(innermost_type) +/// +/// `OffsetPointerOfFixedWidthValues()` can calculate this byte offset and return the +/// pointer to the first relevant byte of the innermost values buffer. +/// +/// \param source The array to check +/// \param force_null_count If true, GetNullCount() is used instead of null_count +ARROW_EXPORT bool IsFixedWidthLike(const ArraySpan& source, + bool force_null_count = false); + +/// \brief Checks if the given array has a fixed-width type or if it's an array of +/// fixed-size list that can be flattened to an array of fixed-width values. +/// +/// This function is a more general version of +/// `IsFixedWidthLike(const ArraySpan&, bool)` that allows the caller to further +/// restrict the inner value types that should be considered fixed-width. +/// +/// \param source The array to check +/// \param force_null_count If true, GetNullCount() is used instead of null_count +/// \param extra_predicate A DataType predicate that can be used to further +/// restrict the types that are considered fixed-width Review Comment: Pushed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
