alamb commented on code in PR #8979:
URL: https://github.com/apache/arrow-rs/pull/8979#discussion_r2607677497
##########
arrow-ipc/src/writer.rs:
##########
@@ -1705,6 +1705,24 @@ fn get_list_array_buffers<O: OffsetSizeTrait>(data:
&ArrayData) -> (Buffer, Arra
(offsets, child_data)
}
+/// Returns the sliced views [`Buffer`] for a BinaryView/Utf8View array.
Review Comment:
I think this is doing something very similar to this code:
https://github.com/apache/arrow-rs/blob/03d3b109244c5c533481d387b8320ff3d97fb797/arrow-ipc/src/writer.rs#L1842-L1850
Thus I recommend we factor the existing (well tested) logic into a new
function and call that function instead of adding one that is specific to views.
##########
arrow-ipc/src/writer.rs:
##########
@@ -3074,6 +3138,52 @@ mod tests {
roundtrip_ensure_sliced_smaller(in_batch, 1000);
}
+ #[test]
+ fn encode_large_lists_non_zero_offset() {
+ let val_inner = Field::new_list_field(DataType::UInt32, true);
+ let val_list_field = Field::new("val",
DataType::LargeList(Arc::new(val_inner)), false);
+ let schema = Arc::new(Schema::new(vec![val_list_field]));
+
+ let values = Arc::new(generate_list_data::<i64>());
+
+ let in_batch = RecordBatch::try_new(schema, vec![values])
+ .unwrap()
+ .slice(999, 1);
+ let out_batch = deserialize_file(serialize_file(&in_batch));
+ assert_eq!(in_batch, out_batch);
+ }
+
+ #[test]
+ fn encode_large_lists_string_non_zero_offset() {
+ let val_inner = Field::new_list_field(DataType::Utf8, true);
+ let val_list_field = Field::new("val",
DataType::LargeList(Arc::new(val_inner)), false);
+ let schema = Arc::new(Schema::new(vec![val_list_field]));
+
+ let values = Arc::new(generate_string_list_data::<i64>());
+
+ let in_batch = RecordBatch::try_new(schema, vec![values])
+ .unwrap()
+ .slice(999, 1);
+ let out_batch = deserialize_file(serialize_file(&in_batch));
+ assert_eq!(in_batch, out_batch);
+ }
+
+ #[test]
+ fn encode_large_list_string_view_non_zero_offset() {
+ let val_inner = Field::new_list_field(DataType::Utf8View, true);
+ let val_list_field = Field::new("val",
DataType::LargeList(Arc::new(val_inner)), false);
+ let schema = Arc::new(Schema::new(vec![val_list_field]));
+
+ let values = Arc::new(generate_utf8view_list_data::<i64>());
+
+ let in_batch = RecordBatch::try_new(schema, vec![values])
Review Comment:
Can you also try slicing a few other ranges (like 0..11 and 23..50) to
ensure the length handling is correct?
##########
arrow-ipc/src/writer.rs:
##########
@@ -1772,7 +1790,18 @@ fn write_array_data(
// Current implementation just serialize the raw arrays as given and
not try to optimize anything.
// If users wants to "compact" the arrays prior to sending them over
IPC,
// they should consider the gc API suggested in #5513
- for buffer in array_data.buffers() {
+ let views = get_view_buffer(array_data);
Review Comment:
it is sort of embarrassing that the comment even says "Slicing the views
buffer is safe and easy," but then the code doesn't do it 🤦
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]