This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c6aad8dbde GH-48004: [C++][Parquet] Fix hang in ColumnReader benchmark 
(#48005)
c6aad8dbde is described below

commit c6aad8dbde7294f6bbc2d25cfb4b625d8fa467fd
Author: Antoine Pitrou <[email protected]>
AuthorDate: Mon Nov 10 09:02:50 2025 +0100

    GH-48004: [C++][Parquet] Fix hang in ColumnReader benchmark (#48005)
    
    ### Rationale for this change
    
    The benchmark was instantiating the `ColumnReader` with less values than it 
would later attempt to read. The `ReadBatch` method would then return 0 
prematurely and the loop would never progress.
    
    ### Are these changes tested?
    
    Manually and by continuous benchmarking.
    
    ### Are there any user-facing changes?
    
    No.
    * GitHub Issue: #48004
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/column_io_benchmark.cc | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/cpp/src/parquet/column_io_benchmark.cc 
b/cpp/src/parquet/column_io_benchmark.cc
index 695b34c76e..4b29a1284d 100644
--- a/cpp/src/parquet/column_io_benchmark.cc
+++ b/cpp/src/parquet/column_io_benchmark.cc
@@ -21,6 +21,7 @@
 #include "arrow/io/memory.h"
 #include "arrow/testing/random.h"
 #include "arrow/util/config.h"
+#include "arrow/util/logging.h"
 
 #include "parquet/column_reader.h"
 #include "parquet/column_writer.h"
@@ -167,14 +168,15 @@ std::shared_ptr<Int64Reader> 
BuildReader(std::shared_ptr<Buffer>& buffer,
 
 static void BM_ReadInt64Column(::benchmark::State& state, Repetition::type 
repetition,
                                Compression::type codec, Encoding::type 
encoding) {
-  format::ColumnChunk thrift_metadata;
+  const auto kNumValues = state.range(0);
+  const auto kBatchSize = state.range(1);
 
   ::arrow::random::RandomArrayGenerator rgen(1337);
-  auto values = rgen.Int64(state.range(0), 0, 1000000, 0);
+  auto values = rgen.Int64(kNumValues, 0, 1000000, 0);
   const auto& int64_values = static_cast<const ::arrow::Int64Array&>(*values);
 
-  std::vector<int16_t> definition_levels(state.range(0), 1);
-  std::vector<int16_t> repetition_levels(state.range(0), 0);
+  std::vector<int16_t> definition_levels(kNumValues, 1);
+  std::vector<int16_t> repetition_levels(kNumValues, 0);
   std::shared_ptr<ColumnDescriptor> schema = Int64Schema(repetition);
   std::shared_ptr<WriterProperties> properties = WriterProperties::Builder()
                                                      .compression(codec)
@@ -182,12 +184,13 @@ static void BM_ReadInt64Column(::benchmark::State& state, 
Repetition::type repet
                                                      ->disable_dictionary()
                                                      ->build();
 
+  format::ColumnChunk thrift_metadata;
   auto metadata = ColumnChunkMetaDataBuilder::Make(
       properties, schema.get(), reinterpret_cast<uint8_t*>(&thrift_metadata));
 
   auto stream = CreateOutputStream();
   std::shared_ptr<Int64Writer> writer = BuildWriter(
-      state.range(0), stream, metadata.get(), schema.get(), properties.get(), 
codec);
+      kNumValues, stream, metadata.get(), schema.get(), properties.get(), 
codec);
   writer->WriteBatch(int64_values.length(), definition_levels.data(),
                      repetition_levels.data(), int64_values.raw_values());
   writer->Close();
@@ -196,16 +199,17 @@ static void BM_ReadInt64Column(::benchmark::State& state, 
Repetition::type repet
   int64_t stream_size = src->size();
   int64_t data_size = int64_values.length() * sizeof(int64_t);
 
-  std::vector<int64_t> values_out(state.range(1));
-  std::vector<int16_t> definition_levels_out(state.range(1));
-  std::vector<int16_t> repetition_levels_out(state.range(1));
+  std::vector<int64_t> values_out(kBatchSize);
+  std::vector<int16_t> definition_levels_out(kBatchSize);
+  std::vector<int16_t> repetition_levels_out(kBatchSize);
   while (state.KeepRunning()) {
     std::shared_ptr<Int64Reader> reader =
-        BuildReader(src, state.range(1), codec, schema.get());
+        BuildReader(src, kNumValues, codec, schema.get());
     int64_t values_read = 0;
-    for (int64_t i = 0; i < int64_values.length(); i += values_read) {
-      reader->ReadBatch(values_out.size(), definition_levels_out.data(),
+    for (int64_t i = 0; i < kNumValues; i += values_read) {
+      reader->ReadBatch(kBatchSize, definition_levels_out.data(),
                         repetition_levels_out.data(), values_out.data(), 
&values_read);
+      ARROW_CHECK_NE(values_read, 0) << "Unexpected end of column";
     }
   }
   SetBytesProcessed(state, repetition);

Reply via email to