pchintar commented on code in PR #9778:
URL: https://github.com/apache/arrow-rs/pull/9778#discussion_r3133816560


##########
arrow-ipc/src/reader.rs:
##########
@@ -868,16 +868,47 @@ fn get_dictionary_values(
     Ok(dictionary_values)
 }
 
-/// Read the data for a given block
+/// Reads the full data block (metadata + body) from the underlying reader.
+///
+/// Uses a zero-initialized buffer for small blocks. For larger blocks, reads
+/// into a temporary `Vec<u8>` and reuses the allocation when it is 64-byte
+/// aligned, matching Arrow's `ALIGNMENT` for `MutableBuffer`. Otherwise, it
+/// falls back to copying into an Arrow-aligned buffer.
+///
+/// This reduces redundant zero-initialization on large reads while preserving
+/// the alignment expected by Arrow buffers.
 fn read_block<R: Read + Seek>(mut reader: R, block: &Block) -> Result<Buffer, 
ArrowError> {
     reader.seek(SeekFrom::Start(block.offset() as u64))?;
     let body_len = block.bodyLength().to_usize().unwrap();
     let metadata_len = block.metaDataLength().to_usize().unwrap();
     let total_len = body_len.checked_add(metadata_len).unwrap();
 
-    let mut buf = MutableBuffer::from_len_zeroed(total_len);
-    reader.read_exact(&mut buf)?;
-    Ok(buf.into())
+    if total_len < 8 * 1024 {
+        let mut buf = MutableBuffer::from_len_zeroed(total_len);
+        reader.read_exact(&mut buf)?;
+        return Ok(buf.into());
+    }
+
+    let mut vec = Vec::with_capacity(total_len);
+    reader
+        .by_ref()
+        .take(total_len as u64)
+        .read_to_end(&mut vec)?;
+
+    if vec.len() != total_len {
+        return Err(ArrowError::IpcError(format!(
+            "Expected IPC block of length {total_len}, got {}",
+            vec.len()
+        )));
+    }
+
+    if ((vec.as_ptr() as usize) & 63) == 0 {
+        Ok(Buffer::from_vec(vec))
+    } else {
+        let mut buf = MutableBuffer::from_len_zeroed(total_len);

Review Comment:
   Thanks @alamb — this is very helpful.
   
   My local results were positive, but the CI benchmark clearly shows the 
current hybrid approach regresses on this machine, so I simplified this and 
used the always-`Vec' approach now. i got these results below: 
   
   <img width="2070" height="1606" alt="Image 4-23-26 at 5 06 PM" 
src="https://github.com/user-attachments/assets/e56e2fc2-7c1f-4721-b76c-22ad93343f60";
 />
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to