pitrou commented on code in PR #47090:
URL: https://github.com/apache/arrow/pull/47090#discussion_r2416949650


##########
cpp/src/parquet/column_writer_test.cc:
##########
@@ -506,6 +509,43 @@ void 
TestPrimitiveWriter<FLBAType>::ReadColumnFully(Compression::type compressio
   this->SyncValuesOut();
 }
 
+template <>
+void TestPrimitiveWriter<ByteArrayType>::ReadColumnFully(Compression::type 
compression,
+                                                         bool 
page_checksum_verify) {
+  int64_t total_values = static_cast<int64_t>(this->values_out_.size());
+  BuildReader(total_values, compression, page_checksum_verify);
+  this->data_buffer_.clear();
+
+  values_read_ = 0;
+  while (values_read_ < total_values) {
+    int64_t values_read_recently = 0;
+    reader_->ReadBatch(total_values - values_read_,
+                       definition_levels_out_.data() + values_read_,
+                       repetition_levels_out_.data() + values_read_,
+                       this->values_out_ptr_ + values_read_, 
&values_read_recently);

Review Comment:
   I'm curious, is it possible for `values_read_recently` to be smaller than 
`total_values - values_read_`?



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -1150,61 +1150,62 @@ void ColumnWriterImpl::FlushBufferedDataPages() {
 // ----------------------------------------------------------------------
 // TypedColumnWriter
 
-template <typename Action>
-inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) {
-  int64_t num_batches = static_cast<int>(total / batch_size);
-  for (int round = 0; round < num_batches; round++) {
-    action(round * batch_size, batch_size, /*check_page_size=*/true);
-  }
-  // Write the remaining values
-  if (total % batch_size > 0) {
-    action(num_batches * batch_size, total % batch_size, 
/*check_page_size=*/true);
-  }
-}
-
-template <typename Action>
+template <typename Action, typename GetBufferedRows>
 inline void DoInBatches(const int16_t* def_levels, const int16_t* rep_levels,
                         int64_t num_levels, int64_t batch_size, Action&& 
action,
-                        bool pages_change_on_record_boundaries) {
-  if (!pages_change_on_record_boundaries || !rep_levels) {
-    // If rep_levels is null, then we are writing a non-repeated column.
-    // In this case, every record contains only one level.
-    return DoInBatches(num_levels, batch_size, std::forward<Action>(action));
-  }
-
+                        bool pages_change_on_record_boundaries, int64_t 
max_rows_per_page,
+                        GetBufferedRows&& curr_page_buffered_rows) {
   int64_t offset = 0;
   while (offset < num_levels) {
-    int64_t end_offset = std::min(offset + batch_size, num_levels);
+    int64_t min_batch_size = std::min(batch_size, num_levels - offset);
+    int64_t end_offset = num_levels;
+    int64_t check_page_limit_end_offset = -1;
+
+    int64_t page_buffered_rows = curr_page_buffered_rows();
+    ARROW_DCHECK_LE(page_buffered_rows, max_rows_per_page);
 
-    // Find next record boundary (i.e. rep_level = 0)
-    while (end_offset < num_levels && rep_levels[end_offset] != 0) {
-      end_offset++;
+    if (!rep_levels) {
+      min_batch_size = std::min(min_batch_size, max_rows_per_page - 
page_buffered_rows);
+      end_offset = offset + min_batch_size;
+      check_page_limit_end_offset = end_offset;
+    } else {
+      int64_t last_record_begin_offset = -1;
+      // Iterate rep_levels to find the shortest sequence that ends before a 
record
+      // boundary (i.e. rep_levels == 0) with a size no less than 
min_batch_size
+      for (int64_t i = offset; i < num_levels; ++i) {
+        if (rep_levels[i] == 0) {
+          last_record_begin_offset = i;
+          if (i - offset >= min_batch_size || page_buffered_rows >= 
max_rows_per_page) {
+            end_offset = i;
+            break;
+          }
+          page_buffered_rows += 1;
+        }
+      }
+      // Use the beginning of last record to check page limit.
+      check_page_limit_end_offset = last_record_begin_offset;
     }
 
+    ARROW_DCHECK_LE(offset, end_offset);
+    ARROW_DCHECK_LE(check_page_limit_end_offset, end_offset);
+
     if (end_offset < num_levels) {
       // This is not the last chunk of batch and end_offset is a record 
boundary.
-      // It is a good chance to check the page size.
-      action(offset, end_offset - offset, /*check_page_size=*/true);
+      // It is a good chance to check the page limit.
+      action(offset, end_offset - offset, /*check_page_limit=*/true);
     } else {
-      DCHECK_EQ(end_offset, num_levels);
-      // This is the last chunk of batch, and we do not know whether 
end_offset is a
-      // record boundary. Find the offset to beginning of last record in this 
chunk,
-      // so we can check page size.
-      int64_t last_record_begin_offset = num_levels - 1;
-      while (last_record_begin_offset >= offset &&
-             rep_levels[last_record_begin_offset] != 0) {
-        last_record_begin_offset--;
+      ARROW_DCHECK_EQ(end_offset, num_levels);
+      if (offset <= check_page_limit_end_offset) {
+        action(offset, check_page_limit_end_offset - offset, 
/*check_page_limit=*/true);
+        offset = check_page_limit_end_offset;
       }
-
-      if (offset <= last_record_begin_offset) {
-        // We have found the beginning of last record and can check page size.
-        action(offset, last_record_begin_offset - offset, 
/*check_page_size=*/true);
-        offset = last_record_begin_offset;
+      if (offset < end_offset) {
+        // This is the last chunk of batch, and we do not know whether 
end_offset is a
+        // record boundary so we cannot check page limit if pages cannot 
change on
+        // record boundaries.
+        action(offset, end_offset - offset,
+               /*check_page_limit=*/!pages_change_on_record_boundaries);

Review Comment:
   Yes, I'd rather we separate the two cases, as that will make the code 
easier to review.



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -1150,61 +1150,62 @@ void ColumnWriterImpl::FlushBufferedDataPages() {
 // ----------------------------------------------------------------------
 // TypedColumnWriter
 
-template <typename Action>
-inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) {
-  int64_t num_batches = static_cast<int>(total / batch_size);
-  for (int round = 0; round < num_batches; round++) {
-    action(round * batch_size, batch_size, /*check_page_size=*/true);
-  }
-  // Write the remaining values
-  if (total % batch_size > 0) {
-    action(num_batches * batch_size, total % batch_size, 
/*check_page_size=*/true);
-  }
-}
-
-template <typename Action>
+template <typename Action, typename GetBufferedRows>
 inline void DoInBatches(const int16_t* def_levels, const int16_t* rep_levels,
-                        int64_t num_levels, int64_t batch_size, Action&& 
action,
-                        bool pages_change_on_record_boundaries) {
-  if (!pages_change_on_record_boundaries || !rep_levels) {
-    // If rep_levels is null, then we are writing a non-repeated column.
-    // In this case, every record contains only one level.
-    return DoInBatches(num_levels, batch_size, std::forward<Action>(action));
-  }
-
+                        int64_t num_levels, int64_t batch_size, int64_t 
max_rows_per_page,
+                        bool pages_change_on_record_boundaries, Action&& 
action,
+                        GetBufferedRows&& curr_page_buffered_rows) {
   int64_t offset = 0;
   while (offset < num_levels) {
-    int64_t end_offset = std::min(offset + batch_size, num_levels);
-
-    // Find next record boundary (i.e. rep_level = 0)
-    while (end_offset < num_levels && rep_levels[end_offset] != 0) {
-      end_offset++;
+    int64_t min_batch_size = std::min(batch_size, num_levels - offset);

Review Comment:
   Should we call this `max_batch_size` instead? 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to