Re: [PR] [Variant] Optimize the object header generation logic in ObjectBuilder::finish [arrow-rs]

via GitHub Sat, 02 Aug 2025 18:05:25 -0700


viirya commented on code in PR #8031:
URL: https://github.com/apache/arrow-rs/pull/8031#discussion_r2249471654



##########
parquet-variant/src/builder.rs:
##########
@@ -1462,49 +1441,36 @@ impl<'a> ObjectBuilder<'a> {
         let num_fields = self.fields.len();
         let is_large = num_fields > u8::MAX as usize;
 
-        let header_size = 1 + // header byte
-            (if is_large { 4 } else { 1 }) + // num_fields
-            (num_fields * id_size as usize) + // field IDs
-            ((num_fields + 1) * offset_size as usize); // field offsets + 
data_size
+        let num_fileds_size = if is_large { 4 } else { 1 }; // is_large: 4 
bytes, else 1 byte.
 
-        let starting_offset = self.parent_value_offset_base;
+        let num_fields_bytes = num_fields.to_le_bytes();
+        let num_elements_bytes = 
num_fields_bytes.iter().take(num_fileds_size).copied();
 
-        // Shift existing data to make room for the header
-        let buffer = parent_buffer.inner_mut();
-        buffer.splice(
-            starting_offset..starting_offset,
-            std::iter::repeat_n(0u8, header_size),
+        let fields = PackedU32Iterator::new(
+            id_size as usize,
+            self.fields.keys().map(|offset| offset.to_le_bytes()),
+        );
+        let offsets = PackedU32Iterator::new(
+            offset_size as usize,
+            self.fields
+                .values()
+                .map(|offset| (*offset as u32).to_le_bytes()),
         );
 
-        // Write header at the original start position
-        let mut header_pos = starting_offset;
-
-        // Write header byte
+        let data_size_bytes = (data_size as u32).to_le_bytes();
+        let data_size_bytes_iter = data_size_bytes.iter().take(offset_size as 
usize).copied();
         let header = object_header(is_large, id_size, offset_size);
+        let bytess_to_splice = std::iter::once(header)
+            .chain(num_elements_bytes)
+            .chain(fields)
+            .chain(offsets)
+            .chain(data_size_bytes_iter);
+
+        let starting_offset = self.parent_value_offset_base;
+        // Shift existing data to make room for the header
+        let buffer = parent_buffer.inner_mut();
+        buffer.splice(starting_offset..starting_offset, bytess_to_splice);
 
-        header_pos = self
-            .parent_state
-            .buffer()
-            .append_header_start_from_buf_pos(header_pos, header, is_large, 
num_fields);
-
-        header_pos = self
-            .parent_state
-            .buffer()
-            .append_offset_array_start_from_buf_pos(

Review Comment:
   Not related to this PR, but `append_offset_array_start_from_buf_pos` also 
writes field ids instead of just offsets.



##########
parquet-variant/src/builder.rs:
##########
@@ -1462,49 +1441,36 @@ impl<'a> ObjectBuilder<'a> {
         let num_fields = self.fields.len();
         let is_large = num_fields > u8::MAX as usize;
 
-        let header_size = 1 + // header byte
-            (if is_large { 4 } else { 1 }) + // num_fields
-            (num_fields * id_size as usize) + // field IDs
-            ((num_fields + 1) * offset_size as usize); // field offsets + 
data_size
+        let num_fileds_size = if is_large { 4 } else { 1 }; // is_large: 4 
bytes, else 1 byte.
 
-        let starting_offset = self.parent_value_offset_base;
+        let num_fields_bytes = num_fields.to_le_bytes();
+        let num_elements_bytes = 
num_fields_bytes.iter().take(num_fileds_size).copied();
 
-        // Shift existing data to make room for the header
-        let buffer = parent_buffer.inner_mut();
-        buffer.splice(
-            starting_offset..starting_offset,
-            std::iter::repeat_n(0u8, header_size),
+        let fields = PackedU32Iterator::new(

Review Comment:
   Honestly that the current approach looks more easy to understand at the 
first glance. This `PackedU32Iterator` approach is a bit over-abstraction to 
me. For appending the bytes into the buffer, I'd prefer to keep it simple 
instead of introducing a new abstraction on it if not too much benefits.



##########
parquet-variant/src/builder.rs:
##########
@@ -64,19 +64,55 @@ fn write_offset(buf: &mut Vec<u8>, value: usize, nbytes: 
u8) {
     buf.extend_from_slice(&bytes[..nbytes as usize]);
 }
 
-/// Write little-endian integer to buffer at a specific position
-fn write_offset_at_pos(buf: &mut [u8], start_pos: usize, value: usize, nbytes: 
u8) {
-    let bytes = value.to_le_bytes();
-    buf[start_pos..start_pos + nbytes as 
usize].copy_from_slice(&bytes[..nbytes as usize]);
-}
-
 /// Append `value_size` bytes of given `value` into `dest`.
 fn append_packed_u32(dest: &mut Vec<u8>, value: u32, value_size: usize) {
     let n = dest.len() + value_size;
     dest.extend(value.to_le_bytes());
     dest.truncate(n);
 }
 
+/// An iterator that yields the bytes of a packed u32 iterator.
+/// Will yield the first `packed_bytes` bytes of each item in the iterator.
+struct PackedU32Iterator<T: Iterator<Item = [u8; 4]>> {
+    packed_bytes: usize,
+    iterator: T,
+    current_item: [u8; 4],
+    current_byte: usize, // 0..3
+}
+
+impl<T: Iterator<Item = [u8; 4]>> PackedU32Iterator<T> {
+    fn new(packed_bytes: usize, iterator: T) -> Self {

Review Comment:
   It's better to add an assert on `packed_bytes`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Re: [PR] [Variant] Optimize the object header generation logic in ObjectBuilder::finish [arrow-rs]

Reply via email to