This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch 53.0.0_maintenance
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/53.0.0_maintenance by this push:
     new 955180b5cd fix: Encoding of List offsets was incorrect when slice offsets begin with zero (#6805) (#6943)
955180b5cd is described below

commit 955180b5cd451c83343e44bc2c510bf377ef2c80
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jan 8 17:25:29 2025 -0500

    fix: Encoding of List offsets was incorrect when slice offsets begin with zero (#6805) (#6943)
    
    * fix: Encoding of List offsets was incorrect when slice offsets begin with zero
    
    When encoding offsets, the code had an optimization to reuse the offsets
    if the first offset was zero, assuming the slice already pointed to the
    first element. But the offset can also be zero if all previous lists
    were empty. When this occurred it would mark all lists in the slice as
    empty, even if they shouldn't be (a minimal sketch of such a slice
    follows the commit message below).
    
    * Use Buffer::from_slice_ref which will be faster as it doesn't iterate through the slice.
    
    * Avoid copying
    
    * Explicitly reference std::mem::size_of
    
    Co-authored-by: Michael Maletich <[email protected]>
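
The sketch below is not part of the commit; it illustrates the situation the
message describes: a sliced ListArray whose first offset is zero even though
the slice does not begin at the start of the parent array, because the lists
sliced away were all empty. Crate paths follow the arrow-rs workspace layout
(arrow-array, arrow-buffer, arrow-schema); the concrete values are
illustrative assumptions.

    use std::sync::Arc;

    use arrow_array::{Array, Int32Array, ListArray};
    use arrow_buffer::OffsetBuffer;
    use arrow_schema::{DataType, Field};

    fn main() {
        // Offsets [0, 0, 0, 3]: the first two lists are empty, the third holds three values.
        let field = Arc::new(Field::new("item", DataType::Int32, true));
        let list = ListArray::new(
            field,
            OffsetBuffer::new(vec![0, 0, 0, 3].into()),
            Arc::new(Int32Array::from(vec![1, 2, 3])),
            None,
        );

        // Slice away the first (empty) list. The slice's first offset is still 0,
        // but the underlying ArrayData now has a non-zero offset; this is the case
        // the old offsets.clone() shortcut in reencode_offsets mishandled.
        let sliced = list.slice(1, 2);
        assert_eq!(sliced.len(), 2);
        assert_eq!(sliced.value_length(1), 3); // the non-empty list keeps its three values
    }

Before the fix, the IPC writer saw the leading zero, reused the whole offsets
buffer, and every list in such a slice was encoded as empty.
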
---
 arrow-ipc/src/writer.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index e6fc9d81df..05a6066420 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -23,6 +23,7 @@
 use std::cmp::min;
 use std::collections::HashMap;
 use std::io::{BufWriter, Write};
+use std::mem::size_of;
 use std::sync::Arc;
 
 use flatbuffers::FlatBufferBuilder;
@@ -1430,7 +1431,13 @@ fn reencode_offsets<O: OffsetSizeTrait>(
     let end_offset = offset_slice.last().unwrap();
 
     let offsets = match start_offset.as_usize() {
-        0 => offsets.clone(),
+        0 => {
+            let size = size_of::<O>();
+            offsets.slice_with_length(
+                data.offset() * size,
+                (data.offset() + data.len() + 1) * size,
+            )
+        }
         _ => offset_slice.iter().map(|x| *x - *start_offset).collect(),
     };
 
@@ -2517,6 +2524,36 @@ mod tests {
         ls.finish()
     }
 
+    fn generate_nested_list_data_starting_at_zero<O: OffsetSizeTrait>() -> GenericListArray<O> {
+        let mut ls =
+            GenericListBuilder::<O, _>::new(GenericListBuilder::<O, _>::new(UInt32Builder::new()));
+
+        for _i in 0..999 {
+            ls.values().append(true);
+            ls.append(true);
+        }
+
+        for j in 0..10 {
+            for value in [j, j, j, j] {
+                ls.values().values().append_value(value);
+            }
+            ls.values().append(true)
+        }
+        ls.append(true);
+
+        for i in 0..9_000 {
+            for j in 0..10 {
+                for value in [i + j, i + j, i + j, i + j] {
+                    ls.values().values().append_value(value);
+                }
+                ls.values().append(true)
+            }
+            ls.append(true);
+        }
+
+        ls.finish()
+    }
+
     fn generate_map_array_data() -> MapArray {
         let keys_builder = UInt32Builder::new();
         let values_builder = UInt32Builder::new();
@@ -2608,6 +2645,19 @@ mod tests {
         roundtrip_ensure_sliced_smaller(in_batch, 1000);
     }
 
+    #[test]
+    fn encode_nested_lists_starting_at_zero() {
+        let inner_int = Arc::new(Field::new("item", DataType::UInt32, true));
+        let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_int), true));
+        let list_field = Field::new("val", DataType::List(inner_list_field), true);
+        let schema = Arc::new(Schema::new(vec![list_field]));
+
+        let values = Arc::new(generate_nested_list_data_starting_at_zero::<i32>());
+
+        let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap();
+        roundtrip_ensure_sliced_smaller(in_batch, 1);
+    }
+
     #[test]
     fn encode_map_array() {
         let keys = Arc::new(Field::new("keys", DataType::UInt32, false));
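
Beyond the committed test, the sketch below shows one way to exercise the
fixed path end to end through the IPC stream format. The helper name
roundtrip_stream and the values are assumptions for illustration, not part of
this commit; the committed test relies on the existing
roundtrip_ensure_sliced_smaller helper instead.

    use std::sync::Arc;

    use arrow_array::cast::AsArray;
    use arrow_array::{Int32Array, ListArray, RecordBatch};
    use arrow_buffer::OffsetBuffer;
    use arrow_ipc::reader::StreamReader;
    use arrow_ipc::writer::StreamWriter;
    use arrow_schema::{DataType, Field, Schema};

    // Write a batch to an in-memory IPC stream and read it back.
    fn roundtrip_stream(batch: &RecordBatch) -> RecordBatch {
        let mut buf = Vec::new();
        let mut writer = StreamWriter::try_new(&mut buf, batch.schema().as_ref()).unwrap();
        writer.write(batch).unwrap();
        writer.finish().unwrap();
        drop(writer);
        let mut reader = StreamReader::try_new(buf.as_slice(), None).unwrap();
        reader.next().unwrap().unwrap()
    }

    fn main() {
        // Two leading empty lists followed by one list of three values; slicing past
        // the first empty list leaves a slice whose first offset is still zero.
        let item = Arc::new(Field::new("item", DataType::Int32, true));
        let list = ListArray::new(
            item.clone(),
            OffsetBuffer::new(vec![0, 0, 0, 3].into()),
            Arc::new(Int32Array::from(vec![1, 2, 3])),
            None,
        );
        let sliced = list.slice(1, 2);

        let schema = Arc::new(Schema::new(vec![Field::new("val", DataType::List(item), true)]));
        let batch = RecordBatch::try_new(schema, vec![Arc::new(sliced)]).unwrap();

        // With the fix the non-empty list survives the round trip; before it,
        // the writer reused the un-sliced offsets and the list came back empty.
        let back = roundtrip_stream(&batch);
        assert_eq!(back.column(0).as_list::<i32>().value_length(1), 3);
    }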
