felipecrv commented on code in PR #41297:
URL: https://github.com/apache/arrow/pull/41297#discussion_r1594703511


##########
cpp/src/arrow/util/fixed_width_internal.h:
##########
@@ -0,0 +1,307 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/array/data.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+
+namespace arrow::compute {
+// XXX: remove dependency on compute::KernelContext
+class KernelContext;
+}  // namespace arrow::compute
+
+namespace arrow::util {
+
+/// \brief Checks if the given array has a fixed-width type or if it's an 
array of
+/// fixed-size list that can be flattened to an array of fixed-width values.
+///
+/// Fixed-width types are the ones defined by the is_fixed_width() predicate in
+/// type_traits.h. They are all the types that passes any of the following
+/// predicates:
+///
+///  - is_primitive()
+///  - is_fixed_size_binary()
+///  - is_dictionary()
+///
+/// At least 3 types in this set require special care:
+///  - `Type::BOOL` is fixed-width, but it's a 1-bit type and pointers to 
first bit
+///    in boolean buffers are not always aligned to byte boundaries.
+///  - `Type::DICTIONARY` is fixed-width because the indices are fixed-width, 
but the
+///    dictionary values are not necessarily fixed-width and have to be managed
+///    by separate operations.
+///  - Type::FIXED_SIZE_BINARY unlike other fixed-width types, fixed-size 
binary
+///    values are defined by a size attribute that is not known at compile 
time.
+///    The other types have power-of-2 byte widths, while fixed-size binary can
+///    have any byte width including 0.
+///
+/// Additionally, we say that a type is "fixed-width like" if it's a 
fixed-width as
+/// defined above, or if it's a fixed-size list (or nested fixed-size lists) 
and
+/// the innermost type is fixed-width and the following restrictions also 
apply:
+///  - The value type of the innermost fixed-size list is not BOOL (it has to 
be excluded
+///    because a 1-bit type doesn't byte-align)
+///  - Only the top-level array may have nulls, all the inner array have to be 
completely
+///    free of nulls so we don't need to manage internal validity bitmaps.
+///
+/// Take the following `fixed_size_list<fixed_size_list<int32, 2>, 3>` array 
as an
+/// example:
+///
+///     [
+///       [[1, 2], [3,  4], [ 5,  6]],
+///       null,
+///       [[7, 8], [9, 10], [11, 12]]
+///     ]
+///
+/// in memory, it would look like:
+///
+///     {
+///        type: fixed_size_list<fixed_size_list<int32, 2>, 3>,
+///        length: 3,
+///        null_count: 1,
+///        offset: 0,
+///        buffers: [
+///          0: [0b00000101]
+///        ],
+///        child_data: [
+///          0: {
+///            type: fixed_size_list<int32, 2>,
+///            length: 9,
+///            null_count: 0,
+///            offset: 0,
+///            buffers: [0: NULL],
+///            child_data: [
+///              0: {
+///                type: int32,
+///                length: 18,
+///                null_count: 0,
+///                offset: 0,
+///                buffers: [
+///                  0: NULL,
+///                  1: [ 1,  2,  3,  4,  5,  6,
+///                       0,  0,  0,  0,  0,  0
+///                       7,  8,  9, 10, 11, 12 ]
+///                ],
+///                child_data: []
+///              }
+///            ]
+///          }
+///        ]
+///     }
+///
+/// This layout fits the fixed-width like definition because the innermost type
+/// is byte-aligned fixed-width (int32 = 4 bytes) and the internal arrays don't
+/// have nulls. The validity bitmap is only needed at the top-level array.
+///
+/// Writing to this array can be done in the same way writing to a flat 
fixed-width
+/// array is done, by:
+/// 1. Updating the validity bitmap at the top-level array if nulls are 
present.
+/// 2. Updating a continuous fixed-width block of memory through a single 
pointer.
+///
+/// The length of this block of memory is the product of the list sizes in the
+/// `FixedSizeList` types and the byte width of the innermost fixed-width type:
+///
+///     3 * 2 * 4 = 24 bytes
+///
+/// Writing the `[[1, 2], [3, 4], [5, 6]]` value at a given index can be done 
by
+/// simply setting the validity bit to 1 and writing the 24-byte sequence of
+/// integers `[1, 2, 3, 4, 5, 6]` to the memory block at `byte_ptr + index * 
24`.
+///
+/// The length of the top-level array fully defines the lengths that all the 
nested
+/// arrays must have, which makes defining all the lengths as easy as defining 
the
+/// length of the top-level array.
+///
+///     length = 3
+///     child_data[0].length == 3 * 3 == 9
+///     child_data[0].child_data[0].length == 3 * 3 * 2 == 18
+///
+///     child_data[0].child_data[0].buffers[1].size() >=
+///       (3 * (3 * 2 * sizeof(int32)) == 3 * 24 == 72)
+///
+/// Dealing with offsets is a bit involved. Let's say the array described 
above has
+/// the offsets 2, 5, and 7:
+///
+///     {
+///        type: fixed_size_list<fixed_size_list<int32, 2>, 3>,
+///        offset: 2,
+///        ...
+///        child_data: [
+///          0: {
+///            type: fixed_size_list<int32, 2>,
+///            offset: 5,
+///            ...
+///            child_data: [
+///              0: {
+///                type: int32,
+///                offset: 7,
+///                buffers: [
+///                  0: NULL,
+///                  1: [ 1, 1, 1, 1, 1, 1, 1,      // 7 values skipped
+///                       0,1, 0,1, 0,1, 0,1, 0,1,  // 5 [x,x] values skipped
+///
+///                       0,0,0,0,0,1,  //
+///                       0,0,0,0,0,1,  // 2 [[x,x], [x,x], [x,x]] values 
skipped
+///
+///                       1,  2,  3,  4,  5,  6,  //
+///                       0,  0,  0,  0,  0,  0   // the actual values
+///                       7,  8,  9, 10, 11, 12   //
+///                     ]
+///                ],
+///              }
+///            ]
+///          }
+///        ]
+///     }
+///
+/// The offset of the innermost values buffer, in bytes, is calculated as:
+///
+///     ((2 * 3) + (5 * 2) + 7) * sizeof(int32) = 29 * 4 bytes = 116 bytes
+///
+/// In general, the formula to calculate the offset of the innermost values 
buffer is:
+///
+///     ((off_0 * fsl_size_0) + (off_1 * fsl_size_1) + ... + innermost_off)
+///        * sizeof(innermost_type)
+///
+/// `OffsetPointerOfFixedWidthValues()` can calculate this byte offset and 
return the
+/// pointer to the first relevant byte of the innermost values buffer.
+///
+/// \param source The array to check
+/// \param force_null_count If true, GetNullCount() is used instead of 
null_count
+/// \param exclude_dictionary If true, DICTIONARY is excluded from the
+///                           is_fixed_width() types. Default: false.
+ARROW_EXPORT bool IsFixedWidthLike(const ArraySpan& source, bool 
force_null_count = false,
+                                   bool exclude_dictionary = false);
+
+/// \brief Checks if the given array has a fixed-width type or if it's an 
array of
+/// fixed-size list that can be flattened to an array of fixed-width values.
+///
+/// This function is a more general version of
+/// `IsFixedWidthLike(const ArraySpan&, bool)` that allows the caller to 
further
+/// restrict the inner value types that should be considered fixed-width.
+///
+/// \param source The array to check
+/// \param force_null_count If true, GetNullCount() is used instead of 
null_count
+/// \param extra_predicate A DataType predicate that can be used to further
+///                        restrict the types that are considered fixed-width
+template <class ExtraPred>
+inline bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count,
+                             ExtraPred extra_predicate) {
+  const auto* type = source.type;
+  // BOOL is considered fixed-width if not nested under FIXED_SIZE_LIST.

Review Comment:
   API change in #41597.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to