alamb commented on code in PR #9766:
URL: https://github.com/apache/arrow-rs/pull/9766#discussion_r3135525174
##########
parquet/src/arrow/arrow_reader/read_plan.rs:
##########
@@ -25,10 +25,57 @@ use crate::arrow::arrow_reader::{
ArrowPredicate, ParquetRecordBatchReader, RowSelection,
RowSelectionCursor, RowSelector,
};
use crate::errors::{ParquetError, Result};
-use arrow_array::Array;
+use arrow_array::{Array, BooleanArray};
+use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder};
use arrow_select::filter::prep_null_mask_filter;
use std::collections::VecDeque;
+/// Options for [`ReadPlanBuilder::with_predicate_options`].
+pub struct PredicateOptions<'a> {
+ array_reader: Box<dyn ArrayReader>,
+ predicate: &'a mut dyn ArrowPredicate,
+ limit: Option<usize>,
+ total_rows: usize,
+}
+
+impl<'a> PredicateOptions<'a> {
+ /// Create options for evaluating `predicate` against rows produced by
+ /// `array_reader`.
+ ///
+ /// By default there is no match-count limit; the predicate is evaluated
+ /// over every row the reader yields. Use [`Self::with_limit`] to enable
+ /// early termination.
+ pub fn new(array_reader: Box<dyn ArrayReader>, predicate: &'a mut dyn
ArrowPredicate) -> Self {
+ Self {
+ array_reader,
+ predicate,
+ limit: None,
+ total_rows: 0,
+ }
+ }
+
+ /// Stop scanning `array_reader` once `limit` matches have accumulated.
+ ///
+ /// Performance optimization for `LIMIT` / TopK: when the cumulative
+ /// `true_count` reaches `limit`, the current filter batch is truncated
+ /// at the `limit`-th match and remaining batches are never decoded.
+ ///
+ /// `limit` counts predicate matches, not output rows — callers applying
+ /// an offset must pass `offset + limit`.
+ ///
+ /// `total_rows` is the row count `array_reader` would yield if iterated
+ /// to completion. It is used to pad un-evaluated trailing rows as "not
+ /// selected" so the returned [`RowSelection`] covers the full row group.
+ ///
+ /// Only valid for the *last* predicate in a filter chain: intermediate
+ /// predicates' match counts do not map 1:1 to output rows.
+ pub fn with_limit(mut self, limit: usize, total_rows: usize) -> Self {
Review Comment:
This is a nice API that makes it clear that both `limit` and `total_rows` are required.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]