Re: [PR] GH-48701: [C++][Parquet] Add ALPpd encoding [arrow]

via GitHub Wed, 03 Jun 2026 22:03:36 -0700


emkornfield commented on code in PR #48345:
URL: https://github.com/apache/arrow/pull/48345#discussion_r3353600350



##########
cpp/src/arrow/util/alp/alp_codec.cc:
##########
@@ -0,0 +1,552 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/alp/alp_codec.h"
+
+#include <cmath>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/alp/alp.h"
+#include "arrow/util/alp/alp_constants.h"
+#include "arrow/util/alp/alp_sampler.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+namespace alp {
+
+// ALP serialization uses memcpy for multi-byte integers (header fields,
+// offsets, frame_of_reference) and assumes little-endian byte order on disk.
+static_assert(ARROW_LITTLE_ENDIAN,
+              "ALP serialization assumes little-endian byte order");
+
+namespace {
+
+// ----------------------------------------------------------------------
+// AlpHeader
+
+/// \brief Header structure for ALP compression blocks
+///
+/// Contains page-level metadata for ALP compression. The num_elements field
+/// stores the total element count for the page, allowing per-vector element
+/// counts to be inferred (all vectors except the last have vector_size 
elements).
+///
+/// Note: num_elements is int32_t to match Parquet page headers (i32 for 
num_values).
+/// See: 
https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
+///
+/// Note: log_vector_size stores the base-2 logarithm of the vector size.
+/// The actual vector size is computed as: 1u << log_vector_size (i.e., 
2^log_vector_size).
+/// For example, log_vector_size=10 means vector_size=1024.
+/// This allows representing any power-of-2 vector size up to 2^255 in a 
single byte.
+///
+/// Header format (7 bytes):
+///
+///   +---------------------------------------------------+
+///   |  AlpHeader (7 bytes)                              |
+///   +---------------------------------------------------+
+///   |  Offset |  Field              |  Size             |
+///   +---------+---------------------+-------------------+
+///   |    0    |  compression_mode   |  1 byte (uint8)   |
+///   |    1    |  integer_encoding   |  1 byte (uint8)   |
+///   |    2    |  log_vector_size    |  1 byte (uint8)   |
+///   |    3    |  num_elements       |  4 bytes (int32)  |
+///   +---------------------------------------------------+
+///
+/// Page-level layout (offset-based interleaved for O(1) random access):
+///
+///   +-------------------------------------------------------------------+
+///   |  [AlpHeader (7B)]                                                 |
+///   |  [Offset₀ | Offset₁ | ... | Offsetₙ₋₁]       ← Vector offsets     |
+///   |  [Vector₀][Vector₁]...[Vectorₙ₋₁]            ← Interleaved data   |
+///   +-------------------------------------------------------------------+
+///   where each Vector = [AlpInfo | ForInfo | Data]
+///
+/// This layout enables O(1) random access to any vector by:
+/// 1. Reading the offset for target vector (direct lookup)
+/// 2. Jumping to that offset to read metadata + data together
+struct AlpHeader {
+  /// Compression mode (currently only kAlp is supported).
+  uint8_t compression_mode = static_cast<uint8_t>(AlpMode::kAlp);
+  /// Integer encoding method used (currently only kForBitPack is supported).
+  uint8_t integer_encoding = 
static_cast<uint8_t>(AlpIntegerEncoding::kForBitPack);
+  /// Log base 2 of vector size. Actual vector size = 1u << log_vector_size.
+  /// For example: 10 means 2^10 = 1024 elements per vector.
+  uint8_t log_vector_size = 0;
+  /// Total number of elements in the page (int32_t to match Parquet's i32 
num_values).
+  /// Per-vector element count is inferred: vector_size for all but the last 
vector.
+  int32_t num_elements = 0;
+
+  /// Size of the serialized header in bytes.
+  static constexpr size_t kSize = 7;
+
+  /// \brief Calculate the number of vectors from total elements and vector 
size
+  ///
+  /// \return number of vectors (full + partial if any)
+  int32_t GetNumVectors() const {
+    const int32_t vector_size = GetVectorSize();
+    return (num_elements + vector_size - 1) / vector_size;
+  }
+
+  /// \brief Get the size of the offsets section
+  ///
+  /// \return size in bytes of the offsets array (num_vectors * 
sizeof(OffsetType))
+  int64_t GetOffsetsSectionSize() const {
+    return static_cast<int64_t>(GetNumVectors()) * 
sizeof(AlpConstants::OffsetType);
+  }
+
+  /// \brief Compute the actual vector size from log_vector_size
+  ///
+  /// \return the vector size (2^log_vector_size)
+  int32_t GetVectorSize() const { return 1 << log_vector_size; }
+
+  /// \brief Compute log base 2 of a power-of-2 value
+  ///
+  /// \param[in] value a power-of-2 value
+  /// \return the log base 2 of value
+  static uint8_t Log2(int32_t value) {
+    ARROW_CHECK(value > 0 && (value & (value - 1)) == 0)
+        << "value_must_be_power_of_2: " << value;
+    return static_cast<uint8_t>(__builtin_ctz(static_cast<unsigned>(value)));
+  }
+
+  /// \brief Calculate the number of elements for a given vector index
+  ///
+  /// \param[in] vector_index the 0-based index of the vector
+  /// \return the number of elements in this vector, or error if index is out 
of range
+  Result<int32_t> GetVectorNumElements(int32_t vector_index) const {
+    const int32_t vector_size = GetVectorSize();
+    const int32_t num_full_vectors = num_elements / vector_size;
+    const int32_t remainder = num_elements % vector_size;
+    if (vector_index < num_full_vectors) {
+      return vector_size;  // Full vector
+    } else if (vector_index == num_full_vectors && remainder > 0) {
+      return remainder;  // Last partial vector
+    }
+    return Status::Invalid("ALP invalid vector index: ", vector_index,
+                           " (num_vectors=", GetNumVectors(), ")");
+  }
+
+  /// \brief Get the AlpMode enum from the stored uint8_t
+  AlpMode GetCompressionMode() const {
+    return static_cast<AlpMode>(compression_mode);
+  }
+
+  /// \brief Get the AlpIntegerEncoding enum from the stored uint8_t
+  AlpIntegerEncoding GetIntegerEncoding() const {
+    return static_cast<AlpIntegerEncoding>(integer_encoding);
+  }
+};
+
+}  // namespace
+
+// ----------------------------------------------------------------------
+// AlpCodec::AlpHeader definition
+
+template <typename T>
+struct AlpCodec<T>::AlpHeader : public ::arrow::util::alp::AlpHeader {
+};
+
+// ----------------------------------------------------------------------
+// AlpCodec implementation
+
+template <typename T>
+auto AlpCodec<T>::LoadHeader(const char* input, int64_t input_size)

Review Comment:
   is auto needed here? It can just return AlpHeader directly?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-48701: [C++][Parquet] Add ALPpd encoding [arrow]

Reply via email to