tustvold commented on code in PR #2044: URL: https://github.com/apache/arrow-rs/pull/2044#discussion_r919102261
########## parquet/src/file/serialized_reader.rs: ########## @@ -550,22 +595,29 @@ impl<T: Read + Send> Iterator for SerializedPageReader<T> { impl<T: Read + Send> PageReader for SerializedPageReader<T> { fn get_next_page(&mut self) -> Result<Option<Page>> { + let mut cursor = &mut self.buf; + let mut dictionary_cursor; while self.seen_num_values < self.total_num_values { - // For now we can not update `seen_num_values` in `skip_next_page`, - // so we need add this check. if let Some(indexes) = &self.page_offset_index { - if indexes.len() == self.seen_num_data_pages { + // For now we can not update `seen_num_values` in `skip_next_page`, + // so we need add this check. + if indexes.len() <= self.seen_num_data_pages { return Ok(None); + } else if self.seen_num_data_pages == 0 + && self.has_dictionary_page_to_read + { + dictionary_cursor = self.page_bufs.pop_front().unwrap(); + cursor = dictionary_cursor.borrow_mut(); Review Comment: ```suggestion cursor = &mut dictionary_cursor; ``` ########## parquet/src/util/page_util.rs: ########## @@ -15,13 +15,40 @@ // specific language governing permissions and limitations // under the License. +use std::collections::VecDeque; +use std::io::Read; +use std::sync::Arc; use crate::errors::Result; use parquet_format::PageLocation; +use crate::file::reader::ChunkReader; +/// Use column chunk's offset index to get the `page_num` page row count. pub(crate) fn calculate_row_count(indexes: &[PageLocation], page_num: usize, total_row_count: i64) -> Result<usize> { if page_num == indexes.len() - 1 { Ok((total_row_count - indexes[page_num].first_row_index + 1) as usize) } else { Ok((indexes[page_num + 1].first_row_index - indexes[page_num].first_row_index) as usize) } } + +/// Use column chunk's offset index to get each page serially readable slice +/// and a flag indicates whether having one dictionary page in this column chunk. +pub(crate) fn get_pages_readable_slices<T: Read + Send, R: ChunkReader + ChunkReader<T=T>>(col_chunk_offset_index: &[PageLocation], col_start: u64, chunk_reader: Arc<R>) -> Result<(VecDeque<T>, bool)> { Review Comment: ```suggestion pub(crate) fn get_pages_readable_slices<T: Read + Send, R: ChunkReader<T=T>>(col_chunk_offset_index: &[PageLocation], col_start: u64, chunk_reader: Arc<R>) -> Result<(VecDeque<T>, bool)> { ``` ########## parquet/src/file/serialized_reader.rs: ########## @@ -526,6 +543,34 @@ impl<T: Read> SerializedPageReader<T> { page_offset_index: None, seen_num_data_pages: 0, has_dictionary_page_to_read: false, + page_bufs: Default::default(), + }; + Ok(result) + } + + /// Creates a new serialized page reader from file source. + pub fn new_with_page_offsets( + buf: T, Review Comment: It would be nice to not need to pass this, as it isn't actually used ########## parquet/src/file/serialized_reader.rs: ########## @@ -498,9 +566,23 @@ impl<T: Read> SerializedPageReader<T> { seen_num_values: 0, decompressor, physical_type, + num_rows, + page_offset_index: Some(offset_index), + seen_num_data_pages: 0, + has_dictionary_page_to_read, + page_bufs, }; Ok(result) } + + pub fn with_page_offset_index_and_has_dictionary_to_read( Review Comment: I would remove this method now in favor of new_with_page_offsets ########## parquet/src/file/serialized_reader.rs: ########## @@ -481,6 +505,22 @@ pub struct SerializedPageReader<T: Read> { // Column chunk type. physical_type: Type, + + // total row count. + num_rows: i64, + + //Page offset index. Review Comment: Perhaps we could have something like ``` enum SerializedPages<T: Read> /// Read entire chunk Chunk { buf: T, total_num_values: i64, }, Pages { index: Vec<PageLocation>, seen_num_data_pages: usize, has_dictionary_page_to_read: bool, page_bufs: VecDeque<T> } ``` To make it clear what is present in what mode ########## parquet/src/file/serialized_reader.rs: ########## @@ -481,6 +505,22 @@ pub struct SerializedPageReader<T: Read> { // Column chunk type. physical_type: Type, + + // total row count. + num_rows: i64, Review Comment: This doesn't appear to be being used, and would avoid this being a breaking change ########## parquet/src/file/serialized_reader.rs: ########## @@ -526,6 +543,34 @@ impl<T: Read> SerializedPageReader<T> { page_offset_index: None, seen_num_data_pages: 0, has_dictionary_page_to_read: false, + page_bufs: Default::default(), + }; + Ok(result) + } + + /// Creates a new serialized page reader from file source. + pub fn new_with_page_offsets( + buf: T, + total_num_values: i64, Review Comment: I don't think we can actually respect this when skipping pages, so not sure we need to provide it -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org