tustvold commented on code in PR #1995:
URL: https://github.com/apache/arrow-rs/pull/1995#discussion_r912420266
##########
parquet/src/column/reader.rs:
##########
@@ -205,94 +197,74 @@ where
// Read exhaustively all pages until we read all batch_size
values/levels
// or there are no more values/levels to read.
- while max(values_read, levels_read) < batch_size {
+ while levels_read < batch_size {
if !self.has_next()? {
break;
}
// Batch size for the current iteration
- let iter_batch_size = {
- // Compute approximate value based on values decoded so far
- let mut adjusted_size = min(
- batch_size,
- (self.num_buffered_values - self.num_decoded_values) as
usize,
- );
-
- // Adjust batch size by taking into account how much data there
- // to read. As batch_size is also smaller than value and level
- // slices (if available), this ensures that available space is
not
- // exceeded.
- adjusted_size = min(adjusted_size, batch_size - values_read);
- adjusted_size = min(adjusted_size, batch_size - levels_read);
-
- adjusted_size
- };
+ let iter_batch_size = (batch_size - levels_read)
+ .min((self.num_buffered_values - self.num_decoded_values) as
usize);
// If the field is required and non-repeated, there are no
definition levels
- let (num_def_levels, null_count) = match def_levels.as_mut() {
- Some(levels) if self.descr.max_def_level() > 0 => {
+ let null_count = match self.descr.max_def_level() > 0 {
+ true => {
+ let levels = def_levels
+ .as_mut()
+ .ok_or_else(|| general_err!("must specify definition
levels"))?;
+
let num_def_levels = self
.def_level_decoder
.as_mut()
.expect("def_level_decoder be set")
- .read(*levels, levels_read..levels_read +
iter_batch_size)?;
+ .read(levels, levels_read..levels_read +
iter_batch_size)?;
+
+ if num_def_levels != iter_batch_size {
+ return Err(general_err!("insufficient definition
levels read from column - expected {}, got {}", iter_batch_size,
num_def_levels));
+ }
- let null_count = levels.count_nulls(
+ levels.count_nulls(
levels_read..levels_read + num_def_levels,
self.descr.max_def_level(),
- );
- (num_def_levels, null_count)
+ )
}
- _ => (0, 0),
+ false => 0,
};
- let num_rep_levels = match rep_levels.as_mut() {
- Some(levels) if self.descr.max_rep_level() > 0 => self
+ if self.descr.max_rep_level() > 0 {
+ let levels = rep_levels
+ .as_mut()
+ .ok_or_else(|| general_err!("must specify repetition
levels"))?;
+
+ let rep_levels = self
.rep_level_decoder
.as_mut()
.expect("rep_level_decoder be set")
- .read(levels, levels_read..levels_read + iter_batch_size)?,
- _ => 0,
- };
+ .read(levels, levels_read..levels_read + iter_batch_size)?;
- // At this point we have read values, definition and repetition
levels.
- // If both definition and repetition levels are defined, their
counts
- // should be equal. Values count is always less or equal to
definition levels.
- if num_def_levels != 0
- && num_rep_levels != 0
Review Comment:
This is the source of #1997 - the value of 0 is used as a sentinel for no
levels present, which prevents detecting the case of no actual values left
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]