alamb commented on code in PR #7524:
URL: https://github.com/apache/arrow-rs/pull/7524#discussion_r2096378873
##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -808,58 +841,92 @@ impl ParquetRecordBatchReader {
fn next_inner(&mut self) -> Result<Option<RecordBatch>> {
let mut read_records = 0;
let batch_size = self.batch_size();
+
+ let mut mask_builder = BooleanBufferBuilder::new(batch_size);
+
match self.read_plan.selection_mut() {
Some(selection) => {
- while read_records < batch_size && !selection.is_empty() {
- let front = selection.pop_front().unwrap();
- if front.skip {
- let skipped =
self.array_reader.skip_records(front.row_count)?;
+ while let Some(cur_selection) =
+ take_next_selection(selection, batch_size - read_records)
+ {
+ let mut total_read = 0;
+ let mut total_skip = 0;
+ for r in cur_selection.iter() {
+ if r.skip {
+ total_skip += r.row_count;
+ } else {
+ total_read += r.row_count;
+ }
+ }
+ let select_count = cur_selection.iter().count();
+ let total = total_skip + total_read;
- if skipped != front.row_count {
- return Err(general_err!(
+ if total < 10 * select_count {
+ let mut bitmap_builder =
BooleanBufferBuilder::new(total);
Review Comment:
> 8192(5 read 10 skip 10 skip, 5 read, 5 read...) => avg < 10 so need to
> change the 8192 small window to bitmap
Yeah, what I am thinking is to somehow avoid creating the pattern of `5 read 10
skip 5 read 10 skip ...` in the first place.
I haven't thought through exactly how to do that.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]