prtkgaur commented on code in PR #48345:
URL: https://github.com/apache/arrow/pull/48345#discussion_r3237577669


##########
cpp/src/arrow/util/alp/alp_codec.cc:
##########
@@ -0,0 +1,532 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/alp/alp_wrapper.h"
+
+#include <cmath>
+#include <optional>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/alp/alp.h"
+#include "arrow/util/alp/alp_constants.h"
+#include "arrow/util/alp/alp_sampler.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+namespace alp {
+
+// ALP serialization uses memcpy for multi-byte integers (header fields,
+// offsets, frame_of_reference) and assumes little-endian byte order on disk.
+static_assert(ARROW_LITTLE_ENDIAN,
+              "ALP serialization assumes little-endian byte order");
+
+namespace {
+
+// ----------------------------------------------------------------------
+// AlpHeader
+
+/// \brief Header structure for ALP compression blocks
+///
+/// Contains page-level metadata for ALP compression. The num_elements field
+/// stores the total element count for the page, allowing per-vector element
+/// counts to be inferred (all vectors except the last have vector_size 
elements).
+///
+/// Note: num_elements is uint32_t because Parquet page headers use i32 for 
num_values.
+/// See: 
https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
+///
+/// Note: log_vector_size stores the base-2 logarithm of the vector size.
+/// The actual vector size is computed as: 1u << log_vector_size (i.e., 
2^log_vector_size).
+/// For example, log_vector_size=10 means vector_size=1024.
+/// This allows representing any power-of-2 vector size up to 2^255 in a 
single byte.
+///
+/// Header format (7 bytes):
+///
+///   +---------------------------------------------------+
+///   |  AlpHeader (7 bytes)                              |
+///   +---------------------------------------------------+
+///   |  Offset |  Field              |  Size             |
+///   +---------+---------------------+-------------------+
+///   |    0    |  compression_mode   |  1 byte (uint8)   |
+///   |    1    |  integer_encoding   |  1 byte (uint8)   |
+///   |    2    |  log_vector_size    |  1 byte (uint8)   |
+///   |    3    |  num_elements       |  4 bytes (uint32) |
+///   +---------------------------------------------------+
+///
+/// Page-level layout (offset-based interleaved for O(1) random access):
+///
+///   +-------------------------------------------------------------------+
+///   |  [AlpHeader (7B)]                                                 |
+///   |  [Offset₀ | Offset₁ | ... | Offsetₙ₋₁]       ← Vector offsets     |
+///   |  [Vector₀][Vector₁]...[Vectorₙ₋₁]            ← Interleaved data   |
+///   +-------------------------------------------------------------------+
+///   where each Vector = [AlpInfo | ForInfo | Data]
+///
+/// This layout enables O(1) random access to any vector by:
+/// 1. Reading the offset for target vector (direct lookup)
+/// 2. Jumping to that offset to read metadata + data together
+struct AlpHeader {
+  /// Compression mode (currently only kAlp is supported).
+  uint8_t compression_mode = static_cast<uint8_t>(AlpMode::kAlp);
+  /// Integer encoding method used (currently only kForBitPack is supported).
+  uint8_t integer_encoding = 
static_cast<uint8_t>(AlpIntegerEncoding::kForBitPack);
+  /// Log base 2 of vector size. Actual vector size = 1u << log_vector_size.
+  /// For example: 10 means 2^10 = 1024 elements per vector.
+  uint8_t log_vector_size = 0;
+  /// Total number of elements in the page (uint32_t since Parquet uses i32).
+  /// Per-vector element count is inferred: vector_size for all but the last 
vector.
+  uint32_t num_elements = 0;
+
+  /// Size of the serialized header in bytes.
+  static constexpr size_t kSize = 7;
+
+  /// \brief Calculate the number of vectors from total elements and vector 
size
+  ///
+  /// \return number of vectors (full + partial if any)
+  uint32_t GetNumVectors() const {
+    const uint32_t vector_size = GetVectorSize();
+    return (num_elements + vector_size - 1) / vector_size;
+  }
+
+  /// \brief Get the size of the offsets section
+  ///
+  /// \return size in bytes of the offsets array (num_vectors * 
sizeof(OffsetType))
+  uint64_t GetOffsetsSectionSize() const {
+    return static_cast<uint64_t>(GetNumVectors()) * 
sizeof(AlpConstants::OffsetType);
+  }
+
+  /// \brief Compute the actual vector size from log_vector_size
+  ///
+  /// \return the vector size (2^log_vector_size)
+  uint32_t GetVectorSize() const { return 1u << log_vector_size; }
+
+  /// \brief Compute log base 2 of a power-of-2 value
+  ///
+  /// \param[in] value a power-of-2 value
+  /// \return the log base 2 of value
+  static uint8_t Log2(uint32_t value) {
+    ARROW_CHECK(value > 0 && (value & (value - 1)) == 0)
+        << "value_must_be_power_of_2: " << value;
+    return static_cast<uint8_t>(__builtin_ctz(value));
+  }
+
+  /// \brief Calculate the number of elements for a given vector index
+  ///
+  /// \param[in] vector_index the 0-based index of the vector
+  /// \return the number of elements in this vector
+  uint16_t GetVectorNumElements(uint64_t vector_index) const {
+    const uint32_t vector_size = GetVectorSize();
+    const uint64_t num_full_vectors = num_elements / vector_size;
+    const uint64_t remainder = num_elements % vector_size;
+    if (vector_index < num_full_vectors) {
+      return static_cast<uint16_t>(vector_size);  // Full vector
+    } else if (vector_index == num_full_vectors && remainder > 0) {
+      return static_cast<uint16_t>(remainder);  // Last partial vector
+    }
+    ARROW_CHECK(false) << "alp_invalid_vector_index: " << vector_index
+                       << " (num_vectors=" << GetNumVectors() << ")";
+    return 0;  // Unreachable, but silences compiler warning
+  }
+
+  /// \brief Get the AlpMode enum from the stored uint8_t
+  AlpMode GetCompressionMode() const {
+    return static_cast<AlpMode>(compression_mode);
+  }
+
+  /// \brief Get the AlpIntegerEncoding enum from the stored uint8_t
+  AlpIntegerEncoding GetIntegerEncoding() const {
+    return static_cast<AlpIntegerEncoding>(integer_encoding);
+  }
+};
+
+}  // namespace
+
+// ----------------------------------------------------------------------
+// AlpCodec::AlpHeader definition
+
+template <typename T>
+struct AlpCodec<T>::AlpHeader : public ::arrow::util::alp::AlpHeader {
+};
+
+// ----------------------------------------------------------------------
+// AlpCodec implementation
+
+template <typename T>
+auto AlpCodec<T>::LoadHeader(const char* comp, size_t comp_size)
+    -> Result<AlpHeader> {
+  if (comp_size < AlpHeader::kSize) {
+    return Status::Invalid("ALP compressed buffer too small for header: ", 
comp_size,
+                           " < ", AlpHeader::kSize);
+  }
+  AlpHeader header{};
+  std::memcpy(&header.compression_mode, comp, 3);
+  std::memcpy(&header.num_elements, comp + 3, sizeof(header.num_elements));
+  return header;
+}
+
+template <typename T>
+auto AlpCodec<T>::CreateSamplingPreset(const T* decomp, size_t decomp_size)
+    -> AlpSamplerResult {
+  ARROW_CHECK(decomp_size % sizeof(T) == 0) << 
"alp_encode_input_must_be_multiple_of_T";
+  const uint64_t element_count = decomp_size / sizeof(T);
+
+  AlpSampler<T> sampler;
+  sampler.AddSample({decomp, element_count});
+  return sampler.Finalize();
+}
+
+template <typename T>
+void AlpCodec<T>::EncodeWithPreset(const T* decomp, size_t decomp_size, char* 
comp,
+                                     size_t* comp_size, const 
AlpSamplerResult& preset) {
+  ARROW_CHECK(decomp_size % sizeof(T) == 0) << 
"alp_encode_input_must_be_multiple_of_T";
+  const uint64_t element_count = decomp_size / sizeof(T);
+
+  // Make room to store header afterwards.
+  char* encoded_header = comp;
+  comp += AlpHeader::kSize;
+  const uint64_t remaining_compressed_size = *comp_size - AlpHeader::kSize;
+
+  const CompressionProgress compression_progress =
+      EncodeAlp(decomp, element_count, comp, remaining_compressed_size,
+                preset.alp_parameters);
+
+  AlpHeader header{};
+  header.compression_mode = static_cast<uint8_t>(AlpMode::kAlp);
+  header.integer_encoding = 
static_cast<uint8_t>(AlpIntegerEncoding::kForBitPack);
+  header.log_vector_size = AlpHeader::Log2(AlpConstants::kAlpVectorSize);

Review Comment:
   I think ALP will really benefit from having the capability to support 
different vector sizes (depending on the data set).
   
   
   I have removed the hardcoded kAlpVectorSize constraint from the decode path. 
The implementation now accepts any valid power-of-2 vector size up to 
2^kMaxLogVectorSize (32768). The encode path still writes with 
vector_size=1024, but we can now decode pages written by other implementations 
that use different vector sizes.
    
   The main change was replacing all StaticVector<T, kAlpVectorSize> 
(compile-time-sized stack buffers) with std::vector<T>, since at 
vector_size=32768 a single StaticVector would need 256KB+ of stack — not 
feasible. This affected 4 structs (AlpEncodedVector, AlpEncodedVectorView, 
EncodingResult, BitPackingResult) and 8 local variables across the 
encode/decode paths. The validation bounds in Load/LoadView/LoadViewDataOnly 
were also updated from kAlpVectorSize to (1 << kMaxLogVectorSize) to match.
   
   Will add tests covering this as well (should not be too tricky to do).



##########
cpp/src/arrow/util/alp/alp.cc:
##########
@@ -0,0 +1,1035 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/alp/alp.h"
+
+#include <cmath>
+#include <cstring>
+#include <functional>
+#include <iostream>
+#include <map>
+
+#include "arrow/util/alp/alp_constants.h"
+#include "arrow/util/bit_stream_utils_internal.h"
+#include "arrow/util/bpacking_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/small_vector.h"
+#include "arrow/util/span.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+namespace alp {
+
+// ----------------------------------------------------------------------
+// AlpEncodedVectorInfo implementation (non-templated, 4 bytes)
+
+void AlpEncodedVectorInfo::Store(arrow::util::span<char> output_buffer) const {
+  ARROW_CHECK(output_buffer.size() >= GetStoredSize())
+      << "alp_vector_info_output_too_small: " << output_buffer.size() << " vs "
+      << GetStoredSize();
+
+  char* ptr = output_buffer.data();
+
+  // exponent, factor: 1 byte each
+  *ptr++ = static_cast<char>(exponent);
+  *ptr++ = static_cast<char>(factor);
+
+  // num_exceptions: 2 bytes
+  std::memcpy(ptr, &num_exceptions, sizeof(num_exceptions));
+}
+
+AlpEncodedVectorInfo AlpEncodedVectorInfo::Load(
+    arrow::util::span<const char> input_buffer) {
+  ARROW_CHECK(input_buffer.size() >= GetStoredSize())
+      << "alp_vector_info_input_too_small: " << input_buffer.size() << " vs "
+      << GetStoredSize();
+
+  AlpEncodedVectorInfo result{};
+  const char* ptr = input_buffer.data();
+
+  // exponent, factor: 1 byte each
+  result.exponent = static_cast<uint8_t>(*ptr++);
+  result.factor = static_cast<uint8_t>(*ptr++);
+
+  // num_exceptions: 2 bytes
+  std::memcpy(&result.num_exceptions, ptr, sizeof(result.num_exceptions));
+
+  return result;
+}
+
+// ----------------------------------------------------------------------
+// AlpEncodedForVectorInfo implementation (templated, 5/9 bytes)
+
+template <typename T>
+void AlpEncodedForVectorInfo<T>::Store(arrow::util::span<char> output_buffer) 
const {
+  ARROW_CHECK(output_buffer.size() >= GetStoredSize())
+      << "alp_for_vector_info_output_too_small: " << output_buffer.size() << " 
vs "
+      << GetStoredSize();
+
+  char* ptr = output_buffer.data();
+
+  // frame_of_reference: 4 bytes for float, 8 bytes for double
+  std::memcpy(ptr, &frame_of_reference, sizeof(frame_of_reference));
+  ptr += sizeof(frame_of_reference);
+
+  // bit_width: 1 byte
+  *ptr = static_cast<char>(bit_width);
+}
+
+template <typename T>
+AlpEncodedForVectorInfo<T> AlpEncodedForVectorInfo<T>::Load(
+    arrow::util::span<const char> input_buffer) {
+  ARROW_CHECK(input_buffer.size() >= GetStoredSize())
+      << "alp_for_vector_info_input_too_small: " << input_buffer.size() << " 
vs "
+      << GetStoredSize();
+
+  AlpEncodedForVectorInfo<T> result{};
+  const char* ptr = input_buffer.data();
+
+  // frame_of_reference: 4 bytes for float, 8 bytes for double
+  std::memcpy(&result.frame_of_reference, ptr, 
sizeof(result.frame_of_reference));
+  ptr += sizeof(result.frame_of_reference);
+
+  // bit_width: 1 byte
+  result.bit_width = static_cast<uint8_t>(*ptr);
+
+  return result;
+}
+
+// Explicit template instantiations for AlpEncodedForVectorInfo
+template struct AlpEncodedForVectorInfo<float>;
+template struct AlpEncodedForVectorInfo<double>;
+
+// ----------------------------------------------------------------------
+// AlpEncodedVector implementation
+
+template <typename T>
+void AlpEncodedVector<T>::Store(arrow::util::span<char> output_buffer) const {
+  const uint64_t overall_size = GetStoredSize();
+  ARROW_CHECK(output_buffer.size() >= overall_size)
+      << "alp_bit_packed_vector_store_output_too_small: " << 
output_buffer.size()
+      << " vs " << overall_size;
+
+  uint64_t offset = 0;
+
+  // Store AlpInfo (4 bytes)
+  alp_info.Store({output_buffer.data() + offset, 
AlpEncodedVectorInfo::kStoredSize});
+  offset += AlpEncodedVectorInfo::kStoredSize;
+
+  // Store ForInfo (6/10 bytes)
+  for_info.Store({output_buffer.data() + offset, 
AlpEncodedForVectorInfo<T>::kStoredSize});
+  offset += AlpEncodedForVectorInfo<T>::kStoredSize;
+
+  // Store data section
+  StoreDataOnly({output_buffer.data() + offset, output_buffer.size() - 
offset});
+}
+
+template <typename T>
+void AlpEncodedVector<T>::StoreDataOnly(arrow::util::span<char> output_buffer) 
const {
+  const uint64_t data_size = GetDataStoredSize();
+  ARROW_CHECK(output_buffer.size() >= data_size)
+      << "alp_bit_packed_vector_store_data_output_too_small: " << 
output_buffer.size()
+      << " vs " << data_size;
+
+  ARROW_CHECK(alp_info.num_exceptions == exceptions.size() &&
+              alp_info.num_exceptions == exception_positions.size())
+      << "alp_bit_packed_vector_store_num_exceptions_mismatch: "
+      << alp_info.num_exceptions << " vs " << exceptions.size() << " vs "
+      << exception_positions.size();
+
+  uint64_t offset = 0;
+
+  // Compute bit_packed_size from num_elements and bit_width
+  const uint64_t bit_packed_size =
+      AlpEncodedForVectorInfo<T>::GetBitPackedSize(num_elements, 
for_info.bit_width);
+
+  // Store all successfully compressed values first.
+  std::memcpy(output_buffer.data() + offset, packed_values.data(), 
bit_packed_size);
+  offset += bit_packed_size;
+
+  // Store exception positions.
+  const uint64_t exception_position_size =
+      alp_info.num_exceptions * sizeof(AlpConstants::PositionType);
+  std::memcpy(output_buffer.data() + offset, exception_positions.data(),
+              exception_position_size);
+  offset += exception_position_size;
+
+  // Store exception values.
+  const uint64_t exception_size = alp_info.num_exceptions * sizeof(T);
+  std::memcpy(output_buffer.data() + offset, exceptions.data(), 
exception_size);
+  offset += exception_size;
+
+  ARROW_CHECK(offset == data_size)
+      << "alp_bit_packed_vector_data_size_mismatch: " << offset << " vs " << 
data_size;
+}
+
+template <typename T>
+AlpEncodedVector<T> AlpEncodedVector<T>::Load(
+    arrow::util::span<const char> input_buffer, uint16_t num_elements) {
+  ARROW_CHECK(num_elements <= AlpConstants::kAlpVectorSize)
+      << "alp_compression_state_element_count_too_large: " << num_elements << 
" vs "
+      << AlpConstants::kAlpVectorSize;
+
+  AlpEncodedVector<T> result;
+  uint64_t input_offset = 0;
+
+  // Load AlpInfo (4 bytes)
+  result.alp_info = AlpEncodedVectorInfo::Load(
+      {input_buffer.data() + input_offset, AlpEncodedVectorInfo::kStoredSize});
+  input_offset += AlpEncodedVectorInfo::kStoredSize;
+
+  // Load ForInfo (6/10 bytes)
+  result.for_info = AlpEncodedForVectorInfo<T>::Load(
+      {input_buffer.data() + input_offset, 
AlpEncodedForVectorInfo<T>::kStoredSize});
+  input_offset += AlpEncodedForVectorInfo<T>::kStoredSize;
+
+  result.num_elements = num_elements;
+
+  const uint64_t overall_size =
+      GetStoredSize(result.alp_info, result.for_info, num_elements);
+
+  ARROW_CHECK(input_buffer.size() >= overall_size)
+      << "alp_compression_state_input_too_small: " << input_buffer.size() << " 
vs "
+      << overall_size;
+
+  // Compute bit_packed_size from num_elements and bit_width
+  const uint64_t bit_packed_size =
+      AlpEncodedForVectorInfo<T>::GetBitPackedSize(num_elements, 
result.for_info.bit_width);
+
+  // Optimization: Use UnsafeResize to avoid zero-initialization before memcpy.
+  // This is safe for POD types since we immediately overwrite with memcpy.
+  result.packed_values.UnsafeResize(bit_packed_size);
+  std::memcpy(result.packed_values.data(), input_buffer.data() + input_offset,
+              bit_packed_size);
+  input_offset += bit_packed_size;
+
+  result.exception_positions.UnsafeResize(result.alp_info.num_exceptions);
+  const uint64_t exception_position_size =
+      result.alp_info.num_exceptions * sizeof(AlpConstants::PositionType);
+  std::memcpy(result.exception_positions.data(), input_buffer.data() + 
input_offset,
+              exception_position_size);
+  input_offset += exception_position_size;
+
+  result.exceptions.UnsafeResize(result.alp_info.num_exceptions);
+  const uint64_t exception_size = result.alp_info.num_exceptions * sizeof(T);
+  std::memcpy(result.exceptions.data(), input_buffer.data() + input_offset,
+              exception_size);
+  return result;
+}
+
+template <typename T>
+uint64_t AlpEncodedVector<T>::GetStoredSize() const {
+  return GetStoredSize(alp_info, for_info, num_elements);
+}
+
+template <typename T>
+uint64_t AlpEncodedVector<T>::GetStoredSize(const AlpEncodedVectorInfo& 
alp_info,
+                                            const AlpEncodedForVectorInfo<T>& 
for_info,
+                                            uint16_t num_elements) {
+  const uint64_t bit_packed_size =
+      AlpEncodedForVectorInfo<T>::GetBitPackedSize(num_elements, 
for_info.bit_width);
+  return AlpEncodedVectorInfo::kStoredSize + 
AlpEncodedForVectorInfo<T>::kStoredSize +
+         bit_packed_size +
+         alp_info.num_exceptions * (sizeof(AlpConstants::PositionType) + 
sizeof(T));
+}
+
+template <typename T>
+bool AlpEncodedVector<T>::operator==(const AlpEncodedVector<T>& other) const {
+  if (alp_info != other.alp_info || for_info != other.for_info ||
+      num_elements != other.num_elements) {
+    return false;
+  }
+  if (packed_values.size() != other.packed_values.size() ||
+      !std::equal(packed_values.begin(), packed_values.end(), 
other.packed_values.begin())) {
+    return false;
+  }
+  if (exceptions.size() != other.exceptions.size() ||
+      !std::equal(exceptions.begin(), exceptions.end(), 
other.exceptions.begin())) {
+    return false;
+  }
+  if (exception_positions.size() != other.exception_positions.size() ||
+      !std::equal(exception_positions.begin(), exception_positions.end(),
+                  other.exception_positions.begin())) {
+    return false;
+  }
+  return true;
+}
+
+// ----------------------------------------------------------------------
+// AlpEncodedVectorView implementation
+
+template <typename T>
+AlpEncodedVectorView<T> AlpEncodedVectorView<T>::LoadView(
+    arrow::util::span<const char> input_buffer, uint16_t num_elements) {
+  ARROW_CHECK(num_elements <= AlpConstants::kAlpVectorSize)
+      << "alp_view_element_count_too_large: " << num_elements << " vs "
+      << AlpConstants::kAlpVectorSize;
+
+  AlpEncodedVectorView<T> result;
+  uint64_t input_offset = 0;
+
+  // Load AlpInfo (4 bytes)
+  result.alp_info = AlpEncodedVectorInfo::Load(
+      {input_buffer.data() + input_offset, AlpEncodedVectorInfo::kStoredSize});
+  input_offset += AlpEncodedVectorInfo::kStoredSize;
+
+  // Load ForInfo (6/10 bytes)
+  result.for_info = AlpEncodedForVectorInfo<T>::Load(
+      {input_buffer.data() + input_offset, 
AlpEncodedForVectorInfo<T>::kStoredSize});
+  input_offset += AlpEncodedForVectorInfo<T>::kStoredSize;
+
+  result.num_elements = num_elements;
+
+  const uint64_t overall_size =
+      AlpEncodedVector<T>::GetStoredSize(result.alp_info, result.for_info, 
num_elements);
+
+  ARROW_CHECK(input_buffer.size() >= overall_size)
+      << "alp_view_input_too_small: " << input_buffer.size() << " vs " << 
overall_size;
+
+  // Load data section (after metadata)
+  AlpEncodedVectorView<T> data_view = LoadViewDataOnly(
+      {input_buffer.data() + input_offset, input_buffer.size() - input_offset},
+      result.alp_info, result.for_info, num_elements);
+
+  // Copy the loaded data into result
+  result.packed_values = data_view.packed_values;
+  result.exception_positions = std::move(data_view.exception_positions);
+  result.exceptions = std::move(data_view.exceptions);
+
+  return result;
+}
+
+template <typename T>
+AlpEncodedVectorView<T> AlpEncodedVectorView<T>::LoadViewDataOnly(
+    arrow::util::span<const char> input_buffer, const AlpEncodedVectorInfo& 
alp_info,
+    const AlpEncodedForVectorInfo<T>& for_info, uint16_t num_elements) {
+  ARROW_CHECK(num_elements <= AlpConstants::kAlpVectorSize)
+      << "alp_view_data_only_element_count_too_large: " << num_elements << " 
vs "
+      << AlpConstants::kAlpVectorSize;
+
+  AlpEncodedVectorView<T> result;
+  result.alp_info = alp_info;
+  result.for_info = for_info;
+  result.num_elements = num_elements;
+
+  const uint64_t data_size = for_info.GetDataStoredSize(num_elements, 
alp_info.num_exceptions);
+  ARROW_CHECK(input_buffer.size() >= data_size)
+      << "alp_view_data_only_input_too_small: " << input_buffer.size() << " vs 
"
+      << data_size;
+
+  uint64_t input_offset = 0;
+
+  // Compute bit_packed_size from num_elements and bit_width
+  const uint64_t bit_packed_size =
+      AlpEncodedForVectorInfo<T>::GetBitPackedSize(num_elements, 
for_info.bit_width);
+
+  // Zero-copy for packed values (bytes have no alignment requirements)
+  result.packed_values = {
+      reinterpret_cast<const uint8_t*>(input_buffer.data() + input_offset),
+      bit_packed_size};
+  input_offset += bit_packed_size;
+
+  // Copy exception positions into aligned storage to avoid UB from misaligned 
access.
+  // Exceptions are rare (typically < 5%), so the copy overhead is negligible.
+  const uint64_t exception_position_size =
+      alp_info.num_exceptions * sizeof(AlpConstants::PositionType);
+  result.exception_positions.UnsafeResize(alp_info.num_exceptions);
+  std::memcpy(result.exception_positions.data(), input_buffer.data() + 
input_offset,
+              exception_position_size);
+  input_offset += exception_position_size;
+
+  // Copy exception values into aligned storage to avoid UB from misaligned 
access.
+  const uint64_t exception_size = alp_info.num_exceptions * sizeof(T);
+  result.exceptions.UnsafeResize(alp_info.num_exceptions);
+  std::memcpy(result.exceptions.data(), input_buffer.data() + input_offset,
+              exception_size);
+
+  return result;
+}
+
+template <typename T>
+uint64_t AlpEncodedVectorView<T>::GetStoredSize() const {
+  return AlpEncodedVector<T>::GetStoredSize(alp_info, for_info, num_elements);
+}
+
+template struct AlpEncodedVectorView<float>;
+template struct AlpEncodedVectorView<double>;
+
+// ----------------------------------------------------------------------
+// AlpMetadataCache implementation
+
+template <typename T>
+AlpMetadataCache<T> AlpMetadataCache<T>::Load(
+    uint32_t num_vectors, uint32_t vector_size, uint32_t total_elements,
+    AlpIntegerEncoding integer_encoding,
+    arrow::util::span<const char> alp_metadata_buffer,
+    arrow::util::span<const char> int_encoding_metadata_buffer) {
+  AlpMetadataCache<T> cache;
+
+  if (num_vectors == 0) {
+    return cache;
+  }
+
+  const uint64_t alp_info_size = AlpEncodedVectorInfo::kStoredSize;
+  const uint64_t expected_alp_size = num_vectors * alp_info_size;
+
+  ARROW_CHECK(alp_metadata_buffer.size() >= expected_alp_size)
+      << "alp_metadata_cache_alp_buffer_too_small: " << 
alp_metadata_buffer.size()
+      << " vs " << expected_alp_size;
+
+  cache.alp_infos_.reserve(num_vectors);
+  cache.cumulative_data_offsets_.reserve(num_vectors);
+  cache.vector_num_elements_.reserve(num_vectors);
+
+  // Calculate number of full vectors and remainder
+  const uint32_t num_full_vectors = total_elements / vector_size;
+  const uint32_t remainder = total_elements % vector_size;
+
+  // Load integer encoding metadata based on encoding type
+  switch (integer_encoding) {
+    case AlpIntegerEncoding::kForBitPack: {
+      const uint64_t for_info_size = AlpEncodedForVectorInfo<T>::kStoredSize;
+      const uint64_t expected_for_size = num_vectors * for_info_size;
+      ARROW_CHECK(int_encoding_metadata_buffer.size() >= expected_for_size)
+          << "alp_metadata_cache_for_buffer_too_small: "
+          << int_encoding_metadata_buffer.size() << " vs " << 
expected_for_size;
+      cache.for_infos_.reserve(num_vectors);
+
+      uint64_t cumulative_offset = 0;
+      for (uint32_t i = 0; i < num_vectors; i++) {
+        // Load AlpInfo
+        const AlpEncodedVectorInfo alp_info = AlpEncodedVectorInfo::Load(
+            {alp_metadata_buffer.data() + i * alp_info_size, alp_info_size});
+        cache.alp_infos_.push_back(alp_info);
+
+        // Load ForInfo for kForBitPack encoding
+        const AlpEncodedForVectorInfo<T> for_info = 
AlpEncodedForVectorInfo<T>::Load(
+            {int_encoding_metadata_buffer.data() + i * for_info_size, 
for_info_size});
+        cache.for_infos_.push_back(for_info);
+
+        // Calculate number of elements for this vector
+        const uint16_t this_vector_elements =
+            (i < num_full_vectors) ? static_cast<uint16_t>(vector_size)
+                                   : static_cast<uint16_t>(remainder);
+        cache.vector_num_elements_.push_back(this_vector_elements);
+
+        // Store cumulative offset (offset to start of this vector's data)
+        cache.cumulative_data_offsets_.push_back(cumulative_offset);
+
+        // Advance offset by this vector's data size
+        cumulative_offset +=
+            for_info.GetDataStoredSize(this_vector_elements, 
alp_info.num_exceptions);
+      }
+      cache.total_data_size_ = cumulative_offset;
+    } break;
+
+    default:
+      ARROW_CHECK(false) << "unsupported_integer_encoding: "
+                         << static_cast<int>(integer_encoding);
+      break;
+  }
+
+  return cache;
+}
+
+template class AlpMetadataCache<float>;
+template class AlpMetadataCache<double>;
+
+template class AlpEncodedVector<float>;
+template class AlpEncodedVector<double>;
+
+// ----------------------------------------------------------------------
+// Internal helper classes
+
+namespace {
+
+/// \brief Helper class for encoding/decoding individual values
+template <typename T>
+class AlpInlines : private AlpConstants {
+ public:
+  using Constants = AlpTypedConstants<T>;
+  using ExactType = typename Constants::FloatingToExact;
+  using SignedExactType = typename Constants::FloatingToSignedExact;
+
+  /// \brief Check if float is a special value that cannot be converted
+  static inline bool IsImpossibleToEncode(const T n) {
+    // We do not have to check for positive or negative infinity, since
+    // std::numeric_limits<T>::infinity() > std::numeric_limits<T>::max()
+    // and vice versa for negative infinity.
+    return std::isnan(n) || n > Constants::kEncodingUpperLimit ||
+           n < Constants::kEncodingLowerLimit ||
+           (n == 0.0 && std::signbit(n));  // Verification for -0.0
+  }
+
+  /// \brief Convert a float to an int without rounding
+  static inline auto FastRound(T n) -> SignedExactType {
+    n = n + Constants::kMagicNumber - Constants::kMagicNumber;
+    return static_cast<SignedExactType>(n);
+  }
+
+  /// \brief Fast way to round float to nearest integer
+  static inline auto NumberToInt(T n) -> SignedExactType {
+    if (IsImpossibleToEncode(n)) {
+      return static_cast<SignedExactType>(Constants::kEncodingUpperLimit);
+    }
+    return FastRound(n);
+  }
+
+  /// \brief Convert a float into an int using encoding options
+  static inline SignedExactType EncodeValue(
+      const T value, const AlpExponentAndFactor exponent_and_factor) {
+    const T tmp_encoded_value = value *
+                                
Constants::GetExponent(exponent_and_factor.exponent) *
+                                
Constants::GetFactor(exponent_and_factor.factor);
+    return NumberToInt(tmp_encoded_value);
+  }
+
+  /// \brief Reconvert an int to a float using encoding options
+  static inline T DecodeValue(const SignedExactType encoded_value,
+                              const AlpExponentAndFactor exponent_and_factor) {
+    // The cast to T is needed to prevent a signed integer overflow.
+    return static_cast<T>(encoded_value) * 
GetFactor(exponent_and_factor.factor) *
+           Constants::GetFactor(exponent_and_factor.exponent);
+  }
+};
+
+/// \brief Helper struct for tracking compression combinations
+struct AlpCombination {
+  AlpExponentAndFactor exponent_and_factor;
+  uint64_t num_appearances{0};
+  uint64_t estimated_compression_size{0};
+};
+
+/// \brief Compare two ALP combinations to determine which is better
+///
+/// Return true if c1 is a better combination than c2.
+/// First criteria is number of times it appears as best combination.
+/// Second criteria is the estimated compression size.
+/// Third criteria is bigger exponent.
+/// Fourth criteria is bigger factor.
+bool CompareAlpCombinations(const AlpCombination& c1, const AlpCombination& 
c2) {
+  return (c1.num_appearances > c2.num_appearances) ||
+         (c1.num_appearances == c2.num_appearances &&
+          (c1.estimated_compression_size < c2.estimated_compression_size)) ||
+         ((c1.num_appearances == c2.num_appearances &&
+           c1.estimated_compression_size == c2.estimated_compression_size) &&
+          (c2.exponent_and_factor.exponent < c1.exponent_and_factor.exponent)) 
||
+         ((c1.num_appearances == c2.num_appearances &&
+           c1.estimated_compression_size == c2.estimated_compression_size &&
+           c2.exponent_and_factor.exponent == c1.exponent_and_factor.exponent) 
&&
+          (c2.exponent_and_factor.factor < c1.exponent_and_factor.factor));
+}
+
+}  // namespace
+
+// ----------------------------------------------------------------------
+// AlpCompression implementation
+
+template <typename T>
+std::optional<uint64_t> AlpCompression<T>::EstimateCompressedSize(
+    const std::vector<T>& input_vector,
+    const AlpExponentAndFactor exponent_and_factor,
+    const bool penalize_exceptions) {
+  // Dry compress a vector (ideally a sample) to estimate ALP compression size
+  // given an exponent and factor.
+  SignedExactType max_encoded_value = 
std::numeric_limits<SignedExactType>::min();
+  SignedExactType min_encoded_value = 
std::numeric_limits<SignedExactType>::max();
+
+  uint64_t num_exceptions = 0;
+  uint64_t num_non_exceptions = 0;
+  for (const T& value : input_vector) {

Review Comment:
   Applied the batched encode/decode/compare split to EstimateCompressedSize, 
which is the hot path during sampling. EncodeVector uses the same pattern but 
also builds exception position/value lists, making a clean split harder — and 
it runs only once per vector vs. many times during parameter selection.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to