tustvold commented on code in PR #2999:
URL: https://github.com/apache/arrow-rs/pull/2999#discussion_r1014596963
##########
parquet/src/column/reader/decoder.rs:
##########
@@ -323,41 +368,153 @@ impl DefinitionLevelDecoder for ColumnLevelDecoderImpl {
) -> Result<(usize, usize)> {
let mut level_skip = 0;
let mut value_skip = 0;
- match self.decoder.as_mut().unwrap() {
- LevelDecoderInner::Packed(reader, bit_width) => {
- for _ in 0..num_levels {
- // Values are delimited by max_def_level
- if max_def_level
- == reader
- .get_value::<i16>(*bit_width as usize)
- .expect("Not enough values in Packed
ColumnLevelDecoderImpl.")
- {
- value_skip += 1;
- }
- level_skip += 1;
- }
- }
- LevelDecoderInner::Rle(reader) => {
- for _ in 0..num_levels {
- if let Some(level) = reader
- .get::<i16>()
- .expect("Not enough values in Rle
ColumnLevelDecoderImpl.")
- {
- // Values are delimited by max_def_level
- if level == max_def_level {
- value_skip += 1;
- }
- }
- level_skip += 1;
+ while level_skip < num_levels {
+ let remaining_levels = num_levels - level_skip;
+
+ if self.buffer.is_empty() {
+ // Only read number of needed values
+ self.read_to_buffer(remaining_levels.min(SKIP_BUFFER_SIZE))?;
+ if self.buffer.is_empty() {
+ // Reached end of page
+ break;
}
}
+ let to_read = self.buffer.len().min(remaining_levels);
+
+ level_skip += to_read;
+ value_skip += self.buffer[..to_read]
+ .iter()
+ .filter(|x| **x == max_def_level)
+ .count();
+
+ self.split_off_buffer(to_read)
}
+
Ok((value_skip, level_skip))
}
}
impl RepetitionLevelDecoder for ColumnLevelDecoderImpl {
- fn skip_rep_levels(&mut self, _num_records: usize) -> Result<(usize,
usize)> {
- Err(nyi_err!("https://github.com/apache/arrow-rs/issues/1792"))
+ fn skip_rep_levels(&mut self, num_records: usize) -> Result<(usize,
usize)> {
+ let mut level_skip = 0;
+ let mut record_skip = 0;
+
+ loop {
+ if self.buffer.is_empty() {
+ // Read SKIP_BUFFER_SIZE as we don't know how many to read
+ self.read_to_buffer(SKIP_BUFFER_SIZE)?;
+ if self.buffer.is_empty() {
+ // Reached end of page
+ break;
+ }
+ }
+
+ let mut to_skip = 0;
+ while to_skip < self.buffer.len() && record_skip != num_records {
+ if self.buffer[to_skip] == 0 {
+ record_skip += 1;
+ }
+ to_skip += 1;
+ }
+
+ // Find end of record
+ while to_skip < self.buffer.len() && self.buffer[to_skip] != 0 {
+ to_skip += 1;
+ }
+
+ level_skip += to_skip;
+ if to_skip >= self.buffer.len() {
Review Comment:
Yeah, the hope is that >= helps LLVM elide bound checks that follow. Not
sure it makes any difference, just habit 😅
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]