pitrou commented on code in PR #13218:
URL: https://github.com/apache/arrow/pull/13218#discussion_r885723070
##########
cpp/src/arrow/compute/exec/key_hash.h:
##########
@@ -45,8 +45,8 @@ class ARROW_EXPORT Hashing32 {
friend void TestBloomSmall(BloomFilterBuildStrategy, int64_t, int, bool,
bool);
public:
- static void HashMultiColumn(const std::vector<KeyColumnArray>& cols,
- KeyEncoder::KeyEncoderContext* ctx, uint32_t*
out_hash);
+ static void HashMultiColumn(const std::vector<KeyColumnArray>& cols,
LightContext* ctx,
+ uint32_t* out_hash);
Review Comment:
For the record, is this a class with only static methods/attributes? This
seems like an anti-pattern.
##########
cpp/src/arrow/compute/api_aggregate.h:
##########
@@ -482,6 +404,21 @@ struct ARROW_EXPORT Aggregate {
const FunctionOptions* options;
};
+Result<std::vector<const HashAggregateKernel*>> GetKernels(
Review Comment:
Do we need to expose these APIs here, or can there be a separate header file
for internal hash-aggregation APIs?
##########
cpp/src/arrow/compute/row/row_internal.h:
##########
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/light_array.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace compute {
+
+/// Description of the data stored in a RowTable
+struct ARROW_EXPORT RowTableMetadata {
+ /// \brief True if there are no variable length columns in the table
+ bool is_fixed_length;
+
+ /// For a fixed-length binary row, common size of rows in bytes,
+ /// rounded up to the multiple of alignment.
+ ///
+ /// For a varying-length binary, size of all encoded fixed-length key
columns,
+ /// including lengths of varying-length columns, rounded up to the multiple
of string
+ /// alignment.
+ uint32_t fixed_length;
+
+ /// Offset within a row to the array of 32-bit offsets within a row of
+ /// ends of varbinary fields.
+ /// Used only when the row is not fixed-length, zero for fixed-length row.
+ /// There are N elements for N varbinary fields.
+ /// Each element is the offset within a row of the first byte after
+ /// the corresponding varbinary field bytes in that row.
+ /// If varbinary fields begin at aligned addresses, than the end of the
previous
+ /// varbinary field needs to be rounded up according to the specified
alignment
+ /// to obtain the beginning of the next varbinary field.
+ /// The first varbinary field starts at offset specified by fixed_length,
+ /// which should already be aligned.
+ uint32_t varbinary_end_array_offset;
+
+ /// Fixed number of bytes per row that are used to encode null masks.
+ /// Null masks indicate for a single row which of its columns are null.
+ /// Nth bit in the sequence of bytes assigned to a row represents null
+ /// information for Nth field according to the order in which they are
encoded.
+ int null_masks_bytes_per_row;
+
+ /// Power of 2. Every row will start at an offset aligned to that number of
bytes.
+ int row_alignment;
+
+ /// Power of 2. Must be no greater than row alignment.
+ /// Every non-power-of-2 binary field and every varbinary field bytes
+ /// will start aligned to that number of bytes.
+ int string_alignment;
+
+ /// Metadata of encoded columns in their original order.
+ std::vector<KeyColumnMetadata> column_metadatas;
+
+ /// Order in which fields are encoded.
+ std::vector<uint32_t> column_order;
+
+ /// Offsets within a row to fields in their encoding order.
+ std::vector<uint32_t> column_offsets;
+
+ /// Rounding up offset to the nearest multiple of alignment value.
+ /// Alignment must be a power of 2.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
required_alignment) {
+ ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
+ return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
+ (required_alignment - 1));
+ }
+
+ /// Rounding up offset to the beginning of next column,
+ /// choosing required alignment based on the data type of that column.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
string_alignment,
+ const KeyColumnMetadata&
col_metadata) {
+ if (!col_metadata.is_fixed_length ||
+ ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
+ return 0;
+ } else {
+ return padding_for_alignment(offset, string_alignment);
+ }
+ }
+
+ /// Returns an array of offsets within a row of ends of varbinary fields.
+ inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// \brief An array of mutable offsets within a row of ends of varbinary
fields.
+ inline uint32_t* varbinary_end_array(uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// Returns the offset within the row and length of the first varbinary
field.
+ inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t*
offset,
+ uint32_t* length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ *offset = fixed_length;
+ *length = varbinary_end_array(row)[0] - fixed_length;
+ }
+
+ /// Returns the offset within the row and length of the second and further
varbinary
+ /// fields.
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int
varbinary_id,
+ uint32_t* out_offset,
+ uint32_t* out_length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ ARROW_DCHECK(varbinary_id > 0);
+ const uint32_t* varbinary_end = varbinary_end_array(row);
+ uint32_t offset = varbinary_end[varbinary_id - 1];
+ offset += padding_for_alignment(offset, string_alignment);
+ *out_offset = offset;
+ *out_length = varbinary_end[varbinary_id] - offset;
+ }
+
+ uint32_t encoded_field_order(uint32_t icol) const { return
column_order[icol]; }
+
+ uint32_t encoded_field_offset(uint32_t icol) const { return
column_offsets[icol]; }
+
+ uint32_t num_cols() const { return
static_cast<uint32_t>(column_metadatas.size()); }
+
+ uint32_t num_varbinary_cols() const;
+
+ /// \brief Populate this instance to describe `cols` with the given alignment
+ void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
+ int in_row_alignment, int in_string_alignment);
+
+ /// \brief True if `other` has the same number of columns
+ /// and each column has the same width (two variable length
+ /// columns are considered to have the same width)
+ bool is_compatible(const RowTableMetadata& other) const;
+};
+
+/// \brief A table of data stored in row-major order
+///
+/// Can only store non-nested data types
+///
+/// Can store both fixed-size data types and variable-length data types
+class ARROW_EXPORT RowTableImpl {
+ public:
+ RowTableImpl();
+ /// \brief Initialize a row array for use
+ ///
+ /// This must be called before any other method
+ Status Init(MemoryPool* pool, const RowTableMetadata& metadata);
+ /// \brief Clear all rows from the table
+ ///
+ /// Does not shrink buffers
+ void Clean();
+ /// \brief Add empty rows
+ /// \param num_rows_to_append The number of empty rows to append
+ /// \param num_extra_bytes_to_append For tables storing variable-length data
this
+ /// should be a guess of how many data bytes will be needed to populate
the
+ /// data. This is ignored if there are no variable-length columns
+ Status AppendEmpty(uint32_t num_rows_to_append, uint32_t
num_extra_bytes_to_append);
+ /// \brief Append rows from a source table
+ /// \param from The table to append from
+ /// \param num_rows_to_append The number of rows to append
+ /// \param source_row_ids Indices (into `from`) of the desired rows
+ Status AppendSelectionFrom(const RowTableImpl& from, uint32_t
num_rows_to_append,
+ const uint16_t* source_row_ids);
+ /// \brief Metadata describing the data stored in this table
+ const RowTableMetadata& metadata() const { return metadata_; }
+ /// \brief The number of rows stored in the table
+ int64_t length() const { return num_rows_; }
+ // Accessors into the table's buffers
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
Review Comment:
Shouldn't this be
```suggestion
ARROW_DCHECK(i >= 0 && i < max_buffers_);
```
##########
cpp/src/arrow/compute/light_array.h:
##########
@@ -31,6 +33,18 @@
namespace arrow {
namespace compute {
+/// \brief Context needed by various execution engine operations
+///
+/// In the execution engine this context is provided by either the node or the
+/// plan and the context exists for the lifetime of the plan. Defining this
here
+/// allows us to take advantage of these resources without coupling the logic
with
+/// the execution engine.
+struct LightContext {
+ bool has_avx2() const { return (hardware_flags &
arrow::internal::CpuInfo::AVX2) > 0; }
Review Comment:
Why is this not using `CpuInfo::IsSupported(CpuInfo::AVX2)`?
##########
cpp/src/arrow/compute/row/row_internal.h:
##########
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/light_array.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace compute {
+
+/// Description of the data stored in a RowTable
+struct ARROW_EXPORT RowTableMetadata {
+ /// \brief True if there are no variable length columns in the table
+ bool is_fixed_length;
+
+ /// For a fixed-length binary row, common size of rows in bytes,
+ /// rounded up to the multiple of alignment.
+ ///
+ /// For a varying-length binary, size of all encoded fixed-length key
columns,
+ /// including lengths of varying-length columns, rounded up to the multiple
of string
+ /// alignment.
+ uint32_t fixed_length;
+
+ /// Offset within a row to the array of 32-bit offsets within a row of
+ /// ends of varbinary fields.
+ /// Used only when the row is not fixed-length, zero for fixed-length row.
+ /// There are N elements for N varbinary fields.
+ /// Each element is the offset within a row of the first byte after
+ /// the corresponding varbinary field bytes in that row.
+ /// If varbinary fields begin at aligned addresses, than the end of the
previous
+ /// varbinary field needs to be rounded up according to the specified
alignment
+ /// to obtain the beginning of the next varbinary field.
+ /// The first varbinary field starts at offset specified by fixed_length,
+ /// which should already be aligned.
+ uint32_t varbinary_end_array_offset;
+
+ /// Fixed number of bytes per row that are used to encode null masks.
+ /// Null masks indicate for a single row which of its columns are null.
+ /// Nth bit in the sequence of bytes assigned to a row represents null
+ /// information for Nth field according to the order in which they are
encoded.
+ int null_masks_bytes_per_row;
+
+ /// Power of 2. Every row will start at an offset aligned to that number of
bytes.
+ int row_alignment;
+
+ /// Power of 2. Must be no greater than row alignment.
+ /// Every non-power-of-2 binary field and every varbinary field bytes
+ /// will start aligned to that number of bytes.
+ int string_alignment;
+
+ /// Metadata of encoded columns in their original order.
+ std::vector<KeyColumnMetadata> column_metadatas;
+
+ /// Order in which fields are encoded.
+ std::vector<uint32_t> column_order;
+
+ /// Offsets within a row to fields in their encoding order.
+ std::vector<uint32_t> column_offsets;
+
+ /// Rounding up offset to the nearest multiple of alignment value.
+ /// Alignment must be a power of 2.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
required_alignment) {
+ ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
+ return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
+ (required_alignment - 1));
+ }
+
+ /// Rounding up offset to the beginning of next column,
+ /// choosing required alignment based on the data type of that column.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
string_alignment,
+ const KeyColumnMetadata&
col_metadata) {
+ if (!col_metadata.is_fixed_length ||
+ ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
+ return 0;
+ } else {
+ return padding_for_alignment(offset, string_alignment);
+ }
+ }
+
+ /// Returns an array of offsets within a row of ends of varbinary fields.
+ inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// \brief An array of mutable offsets within a row of ends of varbinary
fields.
+ inline uint32_t* varbinary_end_array(uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// Returns the offset within the row and length of the first varbinary
field.
+ inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t*
offset,
+ uint32_t* length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ *offset = fixed_length;
+ *length = varbinary_end_array(row)[0] - fixed_length;
+ }
+
+ /// Returns the offset within the row and length of the second and further
varbinary
+ /// fields.
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int
varbinary_id,
+ uint32_t* out_offset,
+ uint32_t* out_length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ ARROW_DCHECK(varbinary_id > 0);
+ const uint32_t* varbinary_end = varbinary_end_array(row);
+ uint32_t offset = varbinary_end[varbinary_id - 1];
+ offset += padding_for_alignment(offset, string_alignment);
+ *out_offset = offset;
+ *out_length = varbinary_end[varbinary_id] - offset;
+ }
+
+ uint32_t encoded_field_order(uint32_t icol) const { return
column_order[icol]; }
+
+ uint32_t encoded_field_offset(uint32_t icol) const { return
column_offsets[icol]; }
+
+ uint32_t num_cols() const { return
static_cast<uint32_t>(column_metadatas.size()); }
+
+ uint32_t num_varbinary_cols() const;
+
+ /// \brief Populate this instance to describe `cols` with the given alignment
+ void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
+ int in_row_alignment, int in_string_alignment);
+
+ /// \brief True if `other` has the same number of columns
+ /// and each column has the same width (two variable length
+ /// columns are considered to have the same width)
+ bool is_compatible(const RowTableMetadata& other) const;
+};
+
+/// \brief A table of data stored in row-major order
+///
+/// Can only store non-nested data types
+///
+/// Can store both fixed-size data types and variable-length data types
+class ARROW_EXPORT RowTableImpl {
+ public:
+ RowTableImpl();
+ /// \brief Initialize a row array for use
+ ///
+ /// This must be called before any other method
+ Status Init(MemoryPool* pool, const RowTableMetadata& metadata);
+ /// \brief Clear all rows from the table
+ ///
+ /// Does not shrink buffers
+ void Clean();
+ /// \brief Add empty rows
+ /// \param num_rows_to_append The number of empty rows to append
+ /// \param num_extra_bytes_to_append For tables storing variable-length data
this
+ /// should be a guess of how many data bytes will be needed to populate
the
+ /// data. This is ignored if there are no variable-length columns
+ Status AppendEmpty(uint32_t num_rows_to_append, uint32_t
num_extra_bytes_to_append);
+ /// \brief Append rows from a source table
+ /// \param from The table to append from
+ /// \param num_rows_to_append The number of rows to append
+ /// \param source_row_ids Indices (into `from`) of the desired rows
+ Status AppendSelectionFrom(const RowTableImpl& from, uint32_t
num_rows_to_append,
+ const uint16_t* source_row_ids);
+ /// \brief Metadata describing the data stored in this table
+ const RowTableMetadata& metadata() const { return metadata_; }
+ /// \brief The number of rows stored in the table
+ int64_t length() const { return num_rows_; }
+ // Accessors into the table's buffers
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return buffers_[i];
+ }
+ uint8_t* mutable_data(int i) {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
Review Comment:
Same here: the upper bound should be `i < max_buffers_`, not `i <= max_buffers_`.
##########
cpp/src/arrow/compute/row/row_internal.h:
##########
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/light_array.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace compute {
+
+/// Description of the data stored in a RowTable
+struct ARROW_EXPORT RowTableMetadata {
+ /// \brief True if there are no variable length columns in the table
+ bool is_fixed_length;
+
+ /// For a fixed-length binary row, common size of rows in bytes,
+ /// rounded up to the multiple of alignment.
+ ///
+ /// For a varying-length binary, size of all encoded fixed-length key
columns,
+ /// including lengths of varying-length columns, rounded up to the multiple
of string
+ /// alignment.
+ uint32_t fixed_length;
Review Comment:
Why are some sizes or quantities unsigned and others signed?
##########
cpp/src/arrow/compute/row/encode_internal.h:
##########
@@ -0,0 +1,323 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/exec/util.h"
+#include "arrow/compute/light_array.h"
+#include "arrow/compute/row/row_internal.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+namespace compute {
+
+/// Converts between Arrow's typical column representation to a row-based
representation
+///
+/// Data is stored as a single array of rows. Each row combines data from all
columns.
+/// The conversion is reversible.
+///
+/// Row-oriented storage is beneficial when there is a need for random access
+/// of individual rows and at the same time all included columns are likely to
+/// be accessed together, as in the case of hash table key.
+///
+/// Does not support nested types
+class RowTableEncoder {
+ public:
+ void Init(const std::vector<KeyColumnMetadata>& cols, LightContext* ctx,
+ int row_alignment, int string_alignment);
+
+ const RowTableMetadata& row_metadata() { return row_metadata_; }
+ // GrouperFastImpl right now needs somewhat intrusive visibility into
RowTableEncoder
+ // This could be cleaned up at some point
+ const std::vector<KeyColumnArray>& batch_all_cols() { return
batch_all_cols_; }
+
+ /// \brief Prepare to encode a collection of columns
+ /// \param start_row The starting row to encode
+ /// \param num_rows The number of rows to encode
+ /// \param cols The columns to encode. The order of the columns should
+ /// be consistent with the order used to create the
RowTableMetadata
+ void PrepareEncodeSelected(int64_t start_row, int64_t num_rows,
+ const std::vector<KeyColumnArray>& cols);
+ /// \brief Encode selection of prepared rows into a row table
+ /// \param rows The output row table
+ /// \param num_selected The number of rows to encode
+ /// \param selection indices of the rows to encode
+ Status EncodeSelected(RowTableImpl* rows, uint32_t num_selected,
+ const uint16_t* selection);
+
+ /// \brief Decode a window of row oriented data into a corresponding
+ /// window of column oriented storage.
+ /// \param start_row_input The starting row to decode
+ /// \param start_row_output An offset into the output array to write to
+ /// \param num_rows The number of rows to decode
+ /// \param rows The row table to decode from
+ /// \param cols The columns to decode into, should be sized appropriately
+ ///
+ /// The output buffers need to be correctly allocated and sized before
+ /// calling each method. For that reason decoding is split into two
functions.
+ /// DecodeFixedLengthBuffers processes everything except for varying length
+ /// buffers.
+ /// The output can be used to find out required varying length buffers sizes
+ /// for the call to DecodeVaryingLengthBuffers
+ void DecodeFixedLengthBuffers(int64_t start_row_input, int64_t
start_row_output,
+ int64_t num_rows, const RowTableImpl& rows,
+ std::vector<KeyColumnArray>* cols);
+
+ /// \brief Decode the varlength columns of a row table into column storage
+ /// \param start_row_input The starting row to decode
+ /// \param start_row_output An offset into the output arrays
+ /// \param num_rows The number of rows to decode
+ /// \param rows The row table to decode from
+ /// \param cols The column arrays to decode into
+ void DecodeVaryingLengthBuffers(int64_t start_row_input, int64_t
start_row_output,
+ int64_t num_rows, const RowTableImpl& rows,
+ std::vector<KeyColumnArray>* cols);
+
+ private:
+ /// Prepare column array vectors.
+ /// Output column arrays represent a range of input column arrays
+ /// specified by starting row and number of rows.
+ /// Three vectors are generated:
+ /// - all columns
+ /// - fixed-length columns only
+ /// - varying-length columns only
+ void PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
+ const std::vector<KeyColumnArray>& cols_in);
+
+ LightContext* ctx_;
+
+ // Data initialized once, based on data types of key columns
+ RowTableMetadata row_metadata_;
+
+ // Data initialized for each input batch.
+ // All elements are ordered according to the order of encoded fields in a
row.
+ std::vector<KeyColumnArray> batch_all_cols_;
+ std::vector<KeyColumnArray> batch_varbinary_cols_;
+ std::vector<uint32_t> batch_varbinary_cols_base_offsets_;
+};
+
+class EncoderInteger {
Review Comment:
Do these all have to be exposed in a `.h`?
##########
cpp/src/arrow/compute/row/row_internal.h:
##########
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/light_array.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace compute {
+
+/// Description of the data stored in a RowTable
+struct ARROW_EXPORT RowTableMetadata {
+ /// \brief True if there are no variable length columns in the table
+ bool is_fixed_length;
+
+ /// For a fixed-length binary row, common size of rows in bytes,
+ /// rounded up to the multiple of alignment.
+ ///
+ /// For a varying-length binary, size of all encoded fixed-length key
columns,
+ /// including lengths of varying-length columns, rounded up to the multiple
of string
+ /// alignment.
+ uint32_t fixed_length;
+
+ /// Offset within a row to the array of 32-bit offsets within a row of
+ /// ends of varbinary fields.
+ /// Used only when the row is not fixed-length, zero for fixed-length row.
+ /// There are N elements for N varbinary fields.
+ /// Each element is the offset within a row of the first byte after
+ /// the corresponding varbinary field bytes in that row.
+ /// If varbinary fields begin at aligned addresses, than the end of the
previous
+ /// varbinary field needs to be rounded up according to the specified
alignment
+ /// to obtain the beginning of the next varbinary field.
+ /// The first varbinary field starts at offset specified by fixed_length,
+ /// which should already be aligned.
+ uint32_t varbinary_end_array_offset;
+
+ /// Fixed number of bytes per row that are used to encode null masks.
+ /// Null masks indicate for a single row which of its columns are null.
+ /// Nth bit in the sequence of bytes assigned to a row represents null
+ /// information for Nth field according to the order in which they are
encoded.
+ int null_masks_bytes_per_row;
+
+ /// Power of 2. Every row will start at an offset aligned to that number of
bytes.
+ int row_alignment;
+
+ /// Power of 2. Must be no greater than row alignment.
+ /// Every non-power-of-2 binary field and every varbinary field bytes
+ /// will start aligned to that number of bytes.
+ int string_alignment;
+
+ /// Metadata of encoded columns in their original order.
+ std::vector<KeyColumnMetadata> column_metadatas;
+
+ /// Order in which fields are encoded.
+ std::vector<uint32_t> column_order;
+
+ /// Offsets within a row to fields in their encoding order.
+ std::vector<uint32_t> column_offsets;
+
+ /// Rounding up offset to the nearest multiple of alignment value.
+ /// Alignment must be a power of 2.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
required_alignment) {
+ ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
+ return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
+ (required_alignment - 1));
+ }
+
+ /// Rounding up offset to the beginning of next column,
+ /// choosing required alignment based on the data type of that column.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
string_alignment,
+ const KeyColumnMetadata&
col_metadata) {
+ if (!col_metadata.is_fixed_length ||
+ ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
+ return 0;
+ } else {
+ return padding_for_alignment(offset, string_alignment);
+ }
+ }
+
+ /// Returns an array of offsets within a row of ends of varbinary fields.
+ inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// \brief An array of mutable offsets within a row of ends of varbinary
fields.
+ inline uint32_t* varbinary_end_array(uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// Returns the offset within the row and length of the first varbinary
field.
+ inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t*
offset,
+ uint32_t* length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ *offset = fixed_length;
+ *length = varbinary_end_array(row)[0] - fixed_length;
+ }
+
+ /// Returns the offset within the row and length of the second and further
varbinary
+ /// fields.
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int
varbinary_id,
+ uint32_t* out_offset,
+ uint32_t* out_length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ ARROW_DCHECK(varbinary_id > 0);
+ const uint32_t* varbinary_end = varbinary_end_array(row);
+ uint32_t offset = varbinary_end[varbinary_id - 1];
+ offset += padding_for_alignment(offset, string_alignment);
+ *out_offset = offset;
+ *out_length = varbinary_end[varbinary_id] - offset;
+ }
+
+ uint32_t encoded_field_order(uint32_t icol) const { return
column_order[icol]; }
+
+ uint32_t encoded_field_offset(uint32_t icol) const { return
column_offsets[icol]; }
+
+ uint32_t num_cols() const { return
static_cast<uint32_t>(column_metadatas.size()); }
+
+ uint32_t num_varbinary_cols() const;
+
+ /// \brief Populate this instance to describe `cols` with the given alignment
+ void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
+ int in_row_alignment, int in_string_alignment);
+
+ /// \brief True if `other` has the same number of columns
+ /// and each column has the same width (two variable length
+ /// columns are considered to have the same width)
+ bool is_compatible(const RowTableMetadata& other) const;
+};
+
+/// \brief A table of data stored in row-major order
+///
+/// Can only store non-nested data types
+///
+/// Can store both fixed-size data types and variable-length data types
+class ARROW_EXPORT RowTableImpl {
+ public:
+ RowTableImpl();
+ /// \brief Initialize a row array for use
+ ///
+ /// This must be called before any other method
+ Status Init(MemoryPool* pool, const RowTableMetadata& metadata);
+ /// \brief Clear all rows from the table
+ ///
+ /// Does not shrink buffers
+ void Clean();
+ /// \brief Add empty rows
+ /// \param num_rows_to_append The number of empty rows to append
+ /// \param num_extra_bytes_to_append For tables storing variable-length data
this
+ /// should be a guess of how many data bytes will be needed to populate
the
+ /// data. This is ignored if there are no variable-length columns
+ Status AppendEmpty(uint32_t num_rows_to_append, uint32_t
num_extra_bytes_to_append);
+ /// \brief Append rows from a source table
+ /// \param from The table to append from
+ /// \param num_rows_to_append The number of rows to append
+ /// \param source_row_ids Indices (into `from`) of the desired rows
+ Status AppendSelectionFrom(const RowTableImpl& from, uint32_t
num_rows_to_append,
+ const uint16_t* source_row_ids);
+ /// \brief Metadata describing the data stored in this table
+ const RowTableMetadata& metadata() const { return metadata_; }
+ /// \brief The number of rows stored in the table
+ int64_t length() const { return num_rows_; }
+ // Accessors into the table's buffers
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return buffers_[i];
+ }
+ uint8_t* mutable_data(int i) {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint32_t* offsets() const { return reinterpret_cast<const
uint32_t*>(data(1)); }
+ uint32_t* mutable_offsets() { return
reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint8_t* null_masks() const { return null_masks_->data(); }
+ uint8_t* null_masks() { return null_masks_->mutable_data(); }
+
+ /// \brief True if there is a null value anywhere in the table
+ ///
+ /// This calculation is memoized based on the number of rows and assumes
+ /// that values are only appended (and not modified in place) between
+ /// successive calls
+ bool has_any_nulls(const LightContext* ctx) const;
+
+ private:
+ Status ResizeFixedLengthBuffers(int64_t num_extra_rows);
+ Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);
+
+ // Helper functions to determine the number of bytes needed for each
+ // buffer given a number of rows.
+ int64_t size_null_masks(int64_t num_rows);
+ int64_t size_offsets(int64_t num_rows);
+ int64_t size_rows_fixed_length(int64_t num_rows);
+ int64_t size_rows_varying_length(int64_t num_bytes);
+ // Called after resize to fix pointers
+ void update_buffer_pointers();
+
+ static constexpr int64_t padding_for_vectors = 64;
Review Comment:
```suggestion
static constexpr int64_t kPaddingForVectors = 64;
```
##########
cpp/src/arrow/compute/row/row_internal.h:
##########
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/light_array.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace compute {
+
+/// Description of the data stored in a RowTable
+struct ARROW_EXPORT RowTableMetadata {
+ /// \brief True if there are no variable length columns in the table
+ bool is_fixed_length;
+
+ /// For a fixed-length binary row, common size of rows in bytes,
+ /// rounded up to the multiple of alignment.
+ ///
+ /// For a varying-length binary, size of all encoded fixed-length key columns,
+ /// including lengths of varying-length columns, rounded up to the multiple of string
+ /// alignment.
+ uint32_t fixed_length;
+
+ /// Offset within a row to the array of 32-bit offsets within a row of
+ /// ends of varbinary fields.
+ /// Used only when the row is not fixed-length, zero for fixed-length row.
+ /// There are N elements for N varbinary fields.
+ /// Each element is the offset within a row of the first byte after
+ /// the corresponding varbinary field bytes in that row.
+ /// If varbinary fields begin at aligned addresses, then the end of the previous
+ /// varbinary field needs to be rounded up according to the specified alignment
+ /// to obtain the beginning of the next varbinary field.
+ /// The first varbinary field starts at offset specified by fixed_length,
+ /// which should already be aligned.
+ uint32_t varbinary_end_array_offset;
+
+ /// Fixed number of bytes per row that are used to encode null masks.
+ /// Null masks indicate for a single row which of its columns are null.
+ /// Nth bit in the sequence of bytes assigned to a row represents null
+ /// information for Nth field according to the order in which they are encoded.
+ int null_masks_bytes_per_row;
+
+ /// Power of 2. Every row will start at an offset aligned to that number of bytes.
+ int row_alignment;
+
+ /// Power of 2. Must be no greater than row alignment.
+ /// Every non-power-of-2 binary field and every varbinary field bytes
+ /// will start aligned to that number of bytes.
+ int string_alignment;
+
+ /// Metadata of encoded columns in their original order.
+ std::vector<KeyColumnMetadata> column_metadatas;
+
+ /// Order in which fields are encoded.
+ std::vector<uint32_t> column_order;
+
+ /// Offsets within a row to fields in their encoding order.
+ std::vector<uint32_t> column_offsets;
+
+ /// Rounding up offset to the nearest multiple of alignment value.
+ /// Alignment must be a power of 2.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
required_alignment) {
+ ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
+ return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
+ (required_alignment - 1));
+ }
+
+ /// Rounding up offset to the beginning of next column,
+ /// choosing required alignment based on the data type of that column.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
string_alignment,
+ const KeyColumnMetadata&
col_metadata) {
+ if (!col_metadata.is_fixed_length ||
+ ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
+ return 0;
+ } else {
+ return padding_for_alignment(offset, string_alignment);
+ }
+ }
+
+ /// Returns an array of offsets within a row of ends of varbinary fields.
+ inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// \brief An array of mutable offsets within a row of ends of varbinary fields.
+ inline uint32_t* varbinary_end_array(uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// Returns the offset within the row and length of the first varbinary field.
+ inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t*
offset,
+ uint32_t* length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ *offset = fixed_length;
+ *length = varbinary_end_array(row)[0] - fixed_length;
+ }
+
+ /// Returns the offset within the row and length of the second and further varbinary
+ /// fields.
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int
varbinary_id,
+ uint32_t* out_offset,
+ uint32_t* out_length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ ARROW_DCHECK(varbinary_id > 0);
+ const uint32_t* varbinary_end = varbinary_end_array(row);
+ uint32_t offset = varbinary_end[varbinary_id - 1];
+ offset += padding_for_alignment(offset, string_alignment);
+ *out_offset = offset;
+ *out_length = varbinary_end[varbinary_id] - offset;
+ }
+
+ uint32_t encoded_field_order(uint32_t icol) const { return
column_order[icol]; }
+
+ uint32_t encoded_field_offset(uint32_t icol) const { return
column_offsets[icol]; }
+
+ uint32_t num_cols() const { return
static_cast<uint32_t>(column_metadatas.size()); }
+
+ uint32_t num_varbinary_cols() const;
+
+ /// \brief Populate this instance to describe `cols` with the given alignment
+ void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
+ int in_row_alignment, int in_string_alignment);
+
+ /// \brief True if `other` has the same number of columns
+ /// and each column has the same width (two variable length
+ /// columns are considered to have the same width)
+ bool is_compatible(const RowTableMetadata& other) const;
+};
+
+/// \brief A table of data stored in row-major order
+///
+/// Can only store non-nested data types
+///
+/// Can store both fixed-size data types and variable-length data types
+class ARROW_EXPORT RowTableImpl {
+ public:
+ RowTableImpl();
+ /// \brief Initialize a row array for use
+ ///
+ /// This must be called before any other method
+ Status Init(MemoryPool* pool, const RowTableMetadata& metadata);
+ /// \brief Clear all rows from the table
+ ///
+ /// Does not shrink buffers
+ void Clean();
+ /// \brief Add empty rows
+ /// \param num_rows_to_append The number of empty rows to append
+ /// \param num_extra_bytes_to_append For tables storing variable-length data this
+ /// should be a guess of how many data bytes will be needed to populate the
+ /// data. This is ignored if there are no variable-length columns
+ Status AppendEmpty(uint32_t num_rows_to_append, uint32_t
num_extra_bytes_to_append);
+ /// \brief Append rows from a source table
+ /// \param from The table to append from
+ /// \param num_rows_to_append The number of rows to append
+ /// \param source_row_ids Indices (into `from`) of the desired rows
+ Status AppendSelectionFrom(const RowTableImpl& from, uint32_t
num_rows_to_append,
+ const uint16_t* source_row_ids);
+ /// \brief Metadata describing the data stored in this table
+ const RowTableMetadata& metadata() const { return metadata_; }
+ /// \brief The number of rows stored in the table
+ int64_t length() const { return num_rows_; }
+ // Accessors into the table's buffers
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return buffers_[i];
+ }
+ uint8_t* mutable_data(int i) {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint32_t* offsets() const { return reinterpret_cast<const
uint32_t*>(data(1)); }
+ uint32_t* mutable_offsets() { return
reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint8_t* null_masks() const { return null_masks_->data(); }
+ uint8_t* null_masks() { return null_masks_->mutable_data(); }
+
+ /// \brief True if there is a null value anywhere in the table
+ ///
+ /// This calculation is memoized based on the number of rows and assumes
+ /// that values are only appended (and not modified in place) between
+ /// successive calls
+ bool has_any_nulls(const LightContext* ctx) const;
+
+ private:
+ Status ResizeFixedLengthBuffers(int64_t num_extra_rows);
+ Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);
+
+ // Helper functions to determine the number of bytes needed for each
+ // buffer given a number of rows.
+ int64_t size_null_masks(int64_t num_rows);
+ int64_t size_offsets(int64_t num_rows);
+ int64_t size_rows_fixed_length(int64_t num_rows);
+ int64_t size_rows_varying_length(int64_t num_bytes);
+ // Called after resize to fix pointers
+ void update_buffer_pointers();
+
+ static constexpr int64_t padding_for_vectors = 64;
+ MemoryPool* pool_;
+ RowTableMetadata metadata_;
+ // Buffers can only expand during lifetime and never shrink.
+ std::unique_ptr<ResizableBuffer> null_masks_;
+ // Only used if the table has variable-length columns
+ // Stores the offsets into the binary data
+ std::unique_ptr<ResizableBuffer> offsets_;
+ // Stores the fixed-length parts of the rows
+ std::unique_ptr<ResizableBuffer> rows_;
+ static constexpr int max_buffers_ = 3;
Review Comment:
```suggestion
static constexpr int kMaxBuffers = 3;
```
##########
cpp/src/arrow/compute/row/row_internal.h:
##########
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/light_array.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace compute {
+
+/// Description of the data stored in a RowTable
+struct ARROW_EXPORT RowTableMetadata {
+ /// \brief True if there are no variable length columns in the table
+ bool is_fixed_length;
+
+ /// For a fixed-length binary row, common size of rows in bytes,
+ /// rounded up to the multiple of alignment.
+ ///
+ /// For a varying-length binary, size of all encoded fixed-length key columns,
+ /// including lengths of varying-length columns, rounded up to the multiple of string
+ /// alignment.
+ uint32_t fixed_length;
+
+ /// Offset within a row to the array of 32-bit offsets within a row of
+ /// ends of varbinary fields.
+ /// Used only when the row is not fixed-length, zero for fixed-length row.
+ /// There are N elements for N varbinary fields.
+ /// Each element is the offset within a row of the first byte after
+ /// the corresponding varbinary field bytes in that row.
+ /// If varbinary fields begin at aligned addresses, then the end of the previous
+ /// varbinary field needs to be rounded up according to the specified alignment
+ /// to obtain the beginning of the next varbinary field.
+ /// The first varbinary field starts at offset specified by fixed_length,
+ /// which should already be aligned.
+ uint32_t varbinary_end_array_offset;
+
+ /// Fixed number of bytes per row that are used to encode null masks.
+ /// Null masks indicate for a single row which of its columns are null.
+ /// Nth bit in the sequence of bytes assigned to a row represents null
+ /// information for Nth field according to the order in which they are encoded.
+ int null_masks_bytes_per_row;
+
+ /// Power of 2. Every row will start at an offset aligned to that number of bytes.
+ int row_alignment;
+
+ /// Power of 2. Must be no greater than row alignment.
+ /// Every non-power-of-2 binary field and every varbinary field bytes
+ /// will start aligned to that number of bytes.
+ int string_alignment;
+
+ /// Metadata of encoded columns in their original order.
+ std::vector<KeyColumnMetadata> column_metadatas;
+
+ /// Order in which fields are encoded.
+ std::vector<uint32_t> column_order;
+
+ /// Offsets within a row to fields in their encoding order.
+ std::vector<uint32_t> column_offsets;
+
+ /// Rounding up offset to the nearest multiple of alignment value.
+ /// Alignment must be a power of 2.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
required_alignment) {
+ ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
+ return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
+ (required_alignment - 1));
+ }
+
+ /// Rounding up offset to the beginning of next column,
+ /// choosing required alignment based on the data type of that column.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
string_alignment,
+ const KeyColumnMetadata&
col_metadata) {
+ if (!col_metadata.is_fixed_length ||
+ ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
+ return 0;
+ } else {
+ return padding_for_alignment(offset, string_alignment);
+ }
+ }
+
+ /// Returns an array of offsets within a row of ends of varbinary fields.
+ inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// \brief An array of mutable offsets within a row of ends of varbinary fields.
+ inline uint32_t* varbinary_end_array(uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// Returns the offset within the row and length of the first varbinary field.
+ inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t*
offset,
+ uint32_t* length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ *offset = fixed_length;
+ *length = varbinary_end_array(row)[0] - fixed_length;
+ }
+
+ /// Returns the offset within the row and length of the second and further varbinary
+ /// fields.
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int
varbinary_id,
+ uint32_t* out_offset,
+ uint32_t* out_length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ ARROW_DCHECK(varbinary_id > 0);
+ const uint32_t* varbinary_end = varbinary_end_array(row);
+ uint32_t offset = varbinary_end[varbinary_id - 1];
+ offset += padding_for_alignment(offset, string_alignment);
+ *out_offset = offset;
+ *out_length = varbinary_end[varbinary_id] - offset;
+ }
+
+ uint32_t encoded_field_order(uint32_t icol) const { return
column_order[icol]; }
+
+ uint32_t encoded_field_offset(uint32_t icol) const { return
column_offsets[icol]; }
+
+ uint32_t num_cols() const { return
static_cast<uint32_t>(column_metadatas.size()); }
+
+ uint32_t num_varbinary_cols() const;
+
+ /// \brief Populate this instance to describe `cols` with the given alignment
+ void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
+ int in_row_alignment, int in_string_alignment);
+
+ /// \brief True if `other` has the same number of columns
+ /// and each column has the same width (two variable length
+ /// columns are considered to have the same width)
+ bool is_compatible(const RowTableMetadata& other) const;
+};
+
+/// \brief A table of data stored in row-major order
+///
+/// Can only store non-nested data types
+///
+/// Can store both fixed-size data types and variable-length data types
+class ARROW_EXPORT RowTableImpl {
+ public:
+ RowTableImpl();
+ /// \brief Initialize a row array for use
+ ///
+ /// This must be called before any other method
+ Status Init(MemoryPool* pool, const RowTableMetadata& metadata);
+ /// \brief Clear all rows from the table
+ ///
+ /// Does not shrink buffers
+ void Clean();
+ /// \brief Add empty rows
+ /// \param num_rows_to_append The number of empty rows to append
+ /// \param num_extra_bytes_to_append For tables storing variable-length data this
+ /// should be a guess of how many data bytes will be needed to populate the
+ /// data. This is ignored if there are no variable-length columns
+ Status AppendEmpty(uint32_t num_rows_to_append, uint32_t
num_extra_bytes_to_append);
+ /// \brief Append rows from a source table
+ /// \param from The table to append from
+ /// \param num_rows_to_append The number of rows to append
+ /// \param source_row_ids Indices (into `from`) of the desired rows
+ Status AppendSelectionFrom(const RowTableImpl& from, uint32_t
num_rows_to_append,
+ const uint16_t* source_row_ids);
+ /// \brief Metadata describing the data stored in this table
+ const RowTableMetadata& metadata() const { return metadata_; }
+ /// \brief The number of rows stored in the table
+ int64_t length() const { return num_rows_; }
+ // Accessors into the table's buffers
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return buffers_[i];
+ }
+ uint8_t* mutable_data(int i) {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint32_t* offsets() const { return reinterpret_cast<const
uint32_t*>(data(1)); }
+ uint32_t* mutable_offsets() { return
reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint8_t* null_masks() const { return null_masks_->data(); }
+ uint8_t* null_masks() { return null_masks_->mutable_data(); }
+
+ /// \brief True if there is a null value anywhere in the table
+ ///
+ /// This calculation is memoized based on the number of rows and assumes
+ /// that values are only appended (and not modified in place) between
+ /// successive calls
+ bool has_any_nulls(const LightContext* ctx) const;
+
+ private:
+ Status ResizeFixedLengthBuffers(int64_t num_extra_rows);
+ Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);
+
+ // Helper functions to determine the number of bytes needed for each
+ // buffer given a number of rows.
+ int64_t size_null_masks(int64_t num_rows);
+ int64_t size_offsets(int64_t num_rows);
+ int64_t size_rows_fixed_length(int64_t num_rows);
+ int64_t size_rows_varying_length(int64_t num_bytes);
+ // Called after resize to fix pointers
+ void update_buffer_pointers();
+
+ static constexpr int64_t padding_for_vectors = 64;
+ MemoryPool* pool_;
+ RowTableMetadata metadata_;
+ // Buffers can only expand during lifetime and never shrink.
+ std::unique_ptr<ResizableBuffer> null_masks_;
+ // Only used if the table has variable-length columns
+ // Stores the offsets into the binary data
Review Comment:
Where is the binary data stored?
##########
cpp/src/arrow/compute/row/row_internal.h:
##########
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/light_array.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace compute {
+
+/// Description of the data stored in a RowTable
+struct ARROW_EXPORT RowTableMetadata {
+ /// \brief True if there are no variable length columns in the table
+ bool is_fixed_length;
+
+ /// For a fixed-length binary row, common size of rows in bytes,
+ /// rounded up to the multiple of alignment.
+ ///
+ /// For a varying-length binary, size of all encoded fixed-length key columns,
+ /// including lengths of varying-length columns, rounded up to the multiple of string
+ /// alignment.
+ uint32_t fixed_length;
+
+ /// Offset within a row to the array of 32-bit offsets within a row of
+ /// ends of varbinary fields.
+ /// Used only when the row is not fixed-length, zero for fixed-length row.
+ /// There are N elements for N varbinary fields.
+ /// Each element is the offset within a row of the first byte after
+ /// the corresponding varbinary field bytes in that row.
+ /// If varbinary fields begin at aligned addresses, then the end of the previous
+ /// varbinary field needs to be rounded up according to the specified alignment
+ /// to obtain the beginning of the next varbinary field.
+ /// The first varbinary field starts at offset specified by fixed_length,
+ /// which should already be aligned.
+ uint32_t varbinary_end_array_offset;
+
+ /// Fixed number of bytes per row that are used to encode null masks.
+ /// Null masks indicate for a single row which of its columns are null.
+ /// Nth bit in the sequence of bytes assigned to a row represents null
+ /// information for Nth field according to the order in which they are encoded.
+ int null_masks_bytes_per_row;
+
+ /// Power of 2. Every row will start at an offset aligned to that number of bytes.
+ int row_alignment;
+
+ /// Power of 2. Must be no greater than row alignment.
+ /// Every non-power-of-2 binary field and every varbinary field bytes
+ /// will start aligned to that number of bytes.
+ int string_alignment;
+
+ /// Metadata of encoded columns in their original order.
+ std::vector<KeyColumnMetadata> column_metadatas;
+
+ /// Order in which fields are encoded.
+ std::vector<uint32_t> column_order;
+
+ /// Offsets within a row to fields in their encoding order.
+ std::vector<uint32_t> column_offsets;
+
+ /// Rounding up offset to the nearest multiple of alignment value.
+ /// Alignment must be a power of 2.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
required_alignment) {
+ ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
+ return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
+ (required_alignment - 1));
+ }
+
+ /// Rounding up offset to the beginning of next column,
+ /// choosing required alignment based on the data type of that column.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
string_alignment,
+ const KeyColumnMetadata&
col_metadata) {
+ if (!col_metadata.is_fixed_length ||
+ ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
+ return 0;
+ } else {
+ return padding_for_alignment(offset, string_alignment);
+ }
+ }
+
+ /// Returns an array of offsets within a row of ends of varbinary fields.
+ inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// \brief An array of mutable offsets within a row of ends of varbinary fields.
+ inline uint32_t* varbinary_end_array(uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// Returns the offset within the row and length of the first varbinary field.
+ inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t*
offset,
+ uint32_t* length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ *offset = fixed_length;
+ *length = varbinary_end_array(row)[0] - fixed_length;
+ }
+
+ /// Returns the offset within the row and length of the second and further varbinary
+ /// fields.
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int
varbinary_id,
+ uint32_t* out_offset,
+ uint32_t* out_length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ ARROW_DCHECK(varbinary_id > 0);
+ const uint32_t* varbinary_end = varbinary_end_array(row);
+ uint32_t offset = varbinary_end[varbinary_id - 1];
+ offset += padding_for_alignment(offset, string_alignment);
+ *out_offset = offset;
+ *out_length = varbinary_end[varbinary_id] - offset;
+ }
+
+ uint32_t encoded_field_order(uint32_t icol) const { return
column_order[icol]; }
+
+ uint32_t encoded_field_offset(uint32_t icol) const { return
column_offsets[icol]; }
+
+ uint32_t num_cols() const { return
static_cast<uint32_t>(column_metadatas.size()); }
+
+ uint32_t num_varbinary_cols() const;
+
+ /// \brief Populate this instance to describe `cols` with the given alignment
+ void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
+ int in_row_alignment, int in_string_alignment);
+
+ /// \brief True if `other` has the same number of columns
+ /// and each column has the same width (two variable length
+ /// columns are considered to have the same width)
+ bool is_compatible(const RowTableMetadata& other) const;
+};
+
+/// \brief A table of data stored in row-major order
+///
+/// Can only store non-nested data types
+///
+/// Can store both fixed-size data types and variable-length data types
+class ARROW_EXPORT RowTableImpl {
+ public:
+ RowTableImpl();
+ /// \brief Initialize a row array for use
+ ///
+ /// This must be called before any other method
+ Status Init(MemoryPool* pool, const RowTableMetadata& metadata);
+ /// \brief Clear all rows from the table
+ ///
+ /// Does not shrink buffers
+ void Clean();
+ /// \brief Add empty rows
+ /// \param num_rows_to_append The number of empty rows to append
+ /// \param num_extra_bytes_to_append For tables storing variable-length data this
+ /// should be a guess of how many data bytes will be needed to populate the
+ /// data. This is ignored if there are no variable-length columns
+ Status AppendEmpty(uint32_t num_rows_to_append, uint32_t
num_extra_bytes_to_append);
+ /// \brief Append rows from a source table
+ /// \param from The table to append from
+ /// \param num_rows_to_append The number of rows to append
+ /// \param source_row_ids Indices (into `from`) of the desired rows
+ Status AppendSelectionFrom(const RowTableImpl& from, uint32_t
num_rows_to_append,
+ const uint16_t* source_row_ids);
+ /// \brief Metadata describing the data stored in this table
+ const RowTableMetadata& metadata() const { return metadata_; }
+ /// \brief The number of rows stored in the table
+ int64_t length() const { return num_rows_; }
+ // Accessors into the table's buffers
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return buffers_[i];
+ }
+ uint8_t* mutable_data(int i) {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint32_t* offsets() const { return reinterpret_cast<const
uint32_t*>(data(1)); }
+ uint32_t* mutable_offsets() { return
reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint8_t* null_masks() const { return null_masks_->data(); }
+ uint8_t* null_masks() { return null_masks_->mutable_data(); }
+
+ /// \brief True if there is a null value anywhere in the table
+ ///
+ /// This calculation is memoized based on the number of rows and assumes
+ /// that values are only appended (and not modified in place) between
+ /// successive calls
+ bool has_any_nulls(const LightContext* ctx) const;
+
+ private:
+ Status ResizeFixedLengthBuffers(int64_t num_extra_rows);
+ Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);
+
+ // Helper functions to determine the number of bytes needed for each
+ // buffer given a number of rows.
+ int64_t size_null_masks(int64_t num_rows);
+ int64_t size_offsets(int64_t num_rows);
+ int64_t size_rows_fixed_length(int64_t num_rows);
+ int64_t size_rows_varying_length(int64_t num_bytes);
+ // Called after resize to fix pointers
+ void update_buffer_pointers();
+
+ static constexpr int64_t padding_for_vectors = 64;
+ MemoryPool* pool_;
+ RowTableMetadata metadata_;
+ // Buffers can only expand during lifetime and never shrink.
+ std::unique_ptr<ResizableBuffer> null_masks_;
+ // Only used if the table has variable-length columns
+ // Stores the offsets into the binary data
+ std::unique_ptr<ResizableBuffer> offsets_;
+ // Stores the fixed-length parts of the rows
+ std::unique_ptr<ResizableBuffer> rows_;
+ static constexpr int max_buffers_ = 3;
+ const uint8_t* buffers_[max_buffers_];
+ uint8_t* mutable_buffers_[max_buffers_];
+ // The number of rows in the table
+ int64_t num_rows_;
+ // The number of rows that can be stored in the table without resizing
+ int64_t rows_capacity_;
+ // The number of bytes that can be stored in the table without resizing
+ int64_t bytes_capacity_;
+
+ // Mutable to allow lazy evaluation
Review Comment:
Should these be atomic or is the row table not thread safe?
##########
cpp/src/arrow/compute/row/row_internal.h:
##########
@@ -0,0 +1,244 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/light_array.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace compute {
+
+/// Description of the data stored in a RowTable
+struct ARROW_EXPORT RowTableMetadata {
+ /// \brief True if there are no variable length columns in the table
+ bool is_fixed_length;
+
+ /// For a fixed-length binary row, common size of rows in bytes,
+ /// rounded up to the multiple of alignment.
+ ///
+ /// For a varying-length binary, size of all encoded fixed-length key
columns,
+ /// including lengths of varying-length columns, rounded up to the multiple
of string
+ /// alignment.
+ uint32_t fixed_length;
+
+ /// Offset within a row to the array of 32-bit offsets within a row of
+ /// ends of varbinary fields.
+ /// Used only when the row is not fixed-length, zero for fixed-length row.
+ /// There are N elements for N varbinary fields.
+ /// Each element is the offset within a row of the first byte after
+ /// the corresponding varbinary field bytes in that row.
+ /// If varbinary fields begin at aligned addresses, then the end of the
previous
+ /// varbinary field needs to be rounded up according to the specified
alignment
+ /// to obtain the beginning of the next varbinary field.
+ /// The first varbinary field starts at offset specified by fixed_length,
+ /// which should already be aligned.
+ uint32_t varbinary_end_array_offset;
+
+ /// Fixed number of bytes per row that are used to encode null masks.
+ /// Null masks indicate for a single row which of its columns are null.
+ /// Nth bit in the sequence of bytes assigned to a row represents null
+ /// information for Nth field according to the order in which they are
encoded.
+ int null_masks_bytes_per_row;
+
+ /// Power of 2. Every row will start at an offset aligned to that number of
bytes.
+ int row_alignment;
+
+ /// Power of 2. Must be no greater than row alignment.
+ /// Every non-power-of-2 binary field and every varbinary field bytes
+ /// will start aligned to that number of bytes.
+ int string_alignment;
+
+ /// Metadata of encoded columns in their original order.
+ std::vector<KeyColumnMetadata> column_metadatas;
+
+ /// Order in which fields are encoded.
+ std::vector<uint32_t> column_order;
+
+ /// Offsets within a row to fields in their encoding order.
+ std::vector<uint32_t> column_offsets;
+
+ /// Rounding up offset to the nearest multiple of alignment value.
+ /// Alignment must be a power of 2.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
required_alignment) {
+ ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
+ return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
+ (required_alignment - 1));
+ }
+
+ /// Rounding up offset to the beginning of next column,
+ /// choosing required alignment based on the data type of that column.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int
string_alignment,
+ const KeyColumnMetadata&
col_metadata) {
+ if (!col_metadata.is_fixed_length ||
+ ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
+ return 0;
+ } else {
+ return padding_for_alignment(offset, string_alignment);
+ }
+ }
+
+ /// Returns an array of offsets within a row of ends of varbinary fields.
+ inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// \brief An array of mutable offsets within a row of ends of varbinary
fields.
+ inline uint32_t* varbinary_end_array(uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// Returns the offset within the row and length of the first varbinary
field.
+ inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t*
offset,
+ uint32_t* length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ *offset = fixed_length;
+ *length = varbinary_end_array(row)[0] - fixed_length;
+ }
+
+ /// Returns the offset within the row and length of the second and further
varbinary
+ /// fields.
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int
varbinary_id,
+ uint32_t* out_offset,
+ uint32_t* out_length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ ARROW_DCHECK(varbinary_id > 0);
+ const uint32_t* varbinary_end = varbinary_end_array(row);
+ uint32_t offset = varbinary_end[varbinary_id - 1];
+ offset += padding_for_alignment(offset, string_alignment);
+ *out_offset = offset;
+ *out_length = varbinary_end[varbinary_id] - offset;
+ }
+
+ uint32_t encoded_field_order(uint32_t icol) const { return
column_order[icol]; }
+
+ uint32_t encoded_field_offset(uint32_t icol) const { return
column_offsets[icol]; }
+
+ uint32_t num_cols() const { return
static_cast<uint32_t>(column_metadatas.size()); }
+
+ uint32_t num_varbinary_cols() const;
+
+ /// \brief Populate this instance to describe `cols` with the given alignment
+ void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
+ int in_row_alignment, int in_string_alignment);
+
+ /// \brief True if `other` has the same number of columns
+ /// and each column has the same width (two variable length
+ /// columns are considered to have the same width)
+ bool is_compatible(const RowTableMetadata& other) const;
+};
+
+/// \brief A table of data stored in row-major order
+///
+/// Can only store non-nested data types
+///
+/// Can store both fixed-size data types and variable-length data types
+class ARROW_EXPORT RowTableImpl {
+ public:
+ RowTableImpl();
+ /// \brief Initialize a row array for use
+ ///
+ /// This must be called before any other method
+ Status Init(MemoryPool* pool, const RowTableMetadata& metadata);
+ /// \brief Clear all rows from the table
+ ///
+ /// Does not shrink buffers
+ void Clean();
+ /// \brief Add empty rows
+ /// \param num_rows_to_append The number of empty rows to append
+ /// \param num_extra_bytes_to_append For tables storing variable-length data
this
+ /// should be a guess of how many data bytes will be needed to populate
the
+ /// data. This is ignored if there are no variable-length columns
+ Status AppendEmpty(uint32_t num_rows_to_append, uint32_t
num_extra_bytes_to_append);
+ /// \brief Append rows from a source table
+ /// \param from The table to append from
+ /// \param num_rows_to_append The number of rows to append
+ /// \param source_row_ids Indices (into `from`) of the desired rows
+ Status AppendSelectionFrom(const RowTableImpl& from, uint32_t
num_rows_to_append,
+ const uint16_t* source_row_ids);
+ /// \brief Metadata describing the data stored in this table
+ const RowTableMetadata& metadata() const { return metadata_; }
+ /// \brief The number of rows stored in the table
+ int64_t length() const { return num_rows_; }
+ // Accessors into the table's buffers
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return buffers_[i];
+ }
+ uint8_t* mutable_data(int i) {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint32_t* offsets() const { return reinterpret_cast<const
uint32_t*>(data(1)); }
+ uint32_t* mutable_offsets() { return
reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint8_t* null_masks() const { return null_masks_->data(); }
+ uint8_t* null_masks() { return null_masks_->mutable_data(); }
+
+ /// \brief True if there is a null value anywhere in the table
+ ///
+ /// This calculation is memoized based on the number of rows and assumes
+ /// that values are only appended (and not modified in place) between
+ /// successive calls
+ bool has_any_nulls(const LightContext* ctx) const;
+
+ private:
+ Status ResizeFixedLengthBuffers(int64_t num_extra_rows);
+ Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);
+
+ // Helper functions to determine the number of bytes needed for each
+ // buffer given a number of rows.
+ int64_t size_null_masks(int64_t num_rows);
+ int64_t size_offsets(int64_t num_rows);
+ int64_t size_rows_fixed_length(int64_t num_rows);
+ int64_t size_rows_varying_length(int64_t num_bytes);
+ // Called after resize to fix pointers
+ void update_buffer_pointers();
+
+ static constexpr int64_t padding_for_vectors = 64;
Review Comment:
Could you also add a comment explaining what `padding_for_vectors` is for?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]