alamb commented on code in PR #5179:
URL: https://github.com/apache/arrow-rs/pull/5179#discussion_r1420850059
##########
arrow-ipc/src/reader.rs:
##########
@@ -535,45 +559,34 @@ pub struct FileReader<R: Read + Seek> {
impl<R: Read + Seek> fmt::Debug for FileReader<R> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::result::Result<(),
fmt::Error> {
f.debug_struct("FileReader<R>")
- .field("reader", &"BufReader<..>")
.field("schema", &self.schema)
.field("blocks", &self.blocks)
.field("current_block", &self.current_block)
.field("total_blocks", &self.total_blocks)
.field("dictionaries_by_id", &self.dictionaries_by_id)
.field("metadata_version", &self.metadata_version)
.field("projection", &self.projection)
- .finish()
+ .finish_non_exhaustive()
}
}
impl<R: Read + Seek> FileReader<R> {
/// Try to create a new file reader
///
- /// Returns errors if the file does not meet the Arrow Format header and
footer
- /// requirements
- pub fn try_new(reader: R, projection: Option<Vec<usize>>) -> Result<Self,
ArrowError> {
- let mut reader = BufReader::new(reader);
- // check if header and footer contain correct magic bytes
- let mut magic_buffer: [u8; 6] = [0; 6];
- reader.read_exact(&mut magic_buffer)?;
- if magic_buffer != super::ARROW_MAGIC {
- return Err(ArrowError::ParseError(
- "Arrow file does not contain correct header".to_string(),
- ));
- }
- reader.seek(SeekFrom::End(-6))?;
- reader.read_exact(&mut magic_buffer)?;
- if magic_buffer != super::ARROW_MAGIC {
+ /// Returns errors if the file does not meet the Arrow Format footer
requirements
+ pub fn try_new(mut reader: R, projection: Option<Vec<usize>>) ->
Result<Self, ArrowError> {
+ let mut buffer = [0; 10];
Review Comment:
```suggestion
// Space for ARROW_MAGIC (6 bytes) and length (4 bytes)
let mut buffer = [0; 10];
```
##########
arrow-ipc/src/reader.rs:
##########
@@ -607,35 +620,14 @@ impl<R: Read + Seek> FileReader<R> {
let mut dictionaries_by_id = HashMap::new();
if let Some(dictionaries) = footer.dictionaries() {
for block in dictionaries {
- // read length from end of offset
- let mut message_size: [u8; 4] = [0; 4];
- reader.seek(SeekFrom::Start(block.offset() as u64))?;
- reader.read_exact(&mut message_size)?;
- if message_size == CONTINUATION_MARKER {
- reader.read_exact(&mut message_size)?;
- }
- let footer_len = i32::from_le_bytes(message_size);
- let mut block_data = vec![0; footer_len as usize];
-
- reader.read_exact(&mut block_data)?;
-
- let message =
crate::root_as_message(&block_data[..]).map_err(|err| {
- ArrowError::ParseError(format!("Unable to get root as
message: {err:?}"))
- })?;
+ let buf = read_block(&mut reader, block)?;
Review Comment:
reducing the number of copies of the code that does the `read` seems like
a good improvement to me
It seems like previously the code did 2 IOs per block (at least) to read the
header and then the data. With this PR it will only do 1 IO a block (given
there is no continuation)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]