Ted-Jiang commented on code in PR #2464:
URL: https://github.com/apache/arrow-rs/pull/2464#discussion_r947417837
##########
parquet/src/file/serialized_reader.rs:
##########
@@ -471,234 +480,232 @@ pub(crate) fn decode_page(
Ok(result)
}
-enum SerializedPages<T: Read> {
- /// Read entire chunk
- Chunk { buf: T },
- /// Read operate pages which can skip.
+enum SerializedPageReaderState {
+ Values {
+ /// The current byte offset in the reader
+ offset: usize,
+
+ /// The length of the chunk in bytes
+ remaining_bytes: usize,
+ },
Pages {
- offset_index: Vec<PageLocation>,
- seen_num_data_pages: usize,
- has_dictionary_page_to_read: bool,
- page_bufs: VecDeque<T>,
+ /// Remaining page locations
+ page_locations: VecDeque<PageLocation>,
+ /// Remaining dictionary location if any
+ dictionary_page: Option<PageLocation>,
+ /// The total number of rows in this column chunk
+ total_rows: usize,
},
}
/// A serialized implementation for Parquet [`PageReader`].
-pub struct SerializedPageReader<T: Read> {
- // The file source buffer which references exactly the bytes for the column trunk
- // to be read by this page reader.
- buf: SerializedPages<T>,
+pub struct SerializedPageReader<R: ChunkReader> {
+ /// The chunk reader
+ reader: Arc<R>,
- // The compression codec for this column chunk. Only set for non-PLAIN codec.
+ /// The compression codec for this column chunk. Only set for non-PLAIN codec.
decompressor: Option<Box<dyn Codec>>,
- // The number of values we have seen so far.
- seen_num_values: i64,
-
- // The number of total values in this column chunk.
- total_num_values: i64,
-
- // Column chunk type.
+ /// Column chunk type.
physical_type: Type,
+
+ state: SerializedPageReaderState,
}
-impl<T: Read> SerializedPageReader<T> {
- /// Creates a new serialized page reader from file source.
+impl<R: ChunkReader> SerializedPageReader<R> {
+ /// Creates a new serialized page reader from a chunk reader and metadata
pub fn new(
- buf: T,
- total_num_values: i64,
- compression: Compression,
- physical_type: Type,
+ reader: Arc<R>,
+ meta: &ColumnChunkMetaData,
+ total_rows: usize,
+ page_locations: Option<Vec<PageLocation>>,
) -> Result<Self> {
- let decompressor = create_codec(compression)?;
- let result = Self {
- buf: SerializedPages::Chunk { buf },
- total_num_values,
- seen_num_values: 0,
- decompressor,
- physical_type,
- };
- Ok(result)
- }
+ let decompressor = create_codec(meta.compression())?;
+ let (start, len) = meta.byte_range();
Review Comment:
Maybe this is more readable:
```suggestion
let start = meta.data_page_offset();
let len = meta.compressed_size();
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]