alamb commented on code in PR #6690:
URL: https://github.com/apache/arrow-rs/pull/6690#discussion_r1834632924


##########
arrow-ipc/src/writer.rs:
##########
@@ -448,15 +454,45 @@ impl IpcDataGenerator {
         Ok(())
     }
 
+    /// Calls [`Self::encoded_batch_with_size`] with no limit, returning the first (and only)
+    /// [`EncodedData`] that is produced. This method should be used over
+    /// [`Self::encoded_batch_with_size`] if the consumer has no concerns about encoded message
+    /// size limits.
+    pub fn encoded_batch(
+        &self,
+        batch: &RecordBatch,
+        dictionary_tracker: &mut DictionaryTracker,
+        write_options: &IpcWriteOptions,
+    ) -> Result<(Vec<EncodedData>, EncodedData), ArrowError> {
+        let (encoded_dictionaries, mut encoded_messages) =
+            self.encoded_batch_with_size(batch, dictionary_tracker, write_options, usize::MAX)?;
+
+        assert_eq!(
+            encoded_messages.len(),
+            1,
+            "encoded_batch with max size of usize::MAX should not be able to 
return more or less than 1 batch"
+        );
+
+        Ok((encoded_dictionaries, encoded_messages.pop().unwrap()))
+    }
+
     /// Encodes a batch to a number of [EncodedData] items (dictionary batches + the record batch).
     /// The [DictionaryTracker] keeps track of dictionaries with new `dict_id`s (so they are only sent once).
     /// Make sure the [DictionaryTracker] is initialized at the start of the stream.
-    pub fn encoded_batch(
+    /// The `max_flight_data_size` is used to control how much space each encoded [`RecordBatch`] is

Review Comment:
   Since this is the IPC encoder (which can be used directly, in addition to in Flight), what do you think about changing the name of `max_flight_data_size` to `max_encoded_data_size`?
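
   For illustration, a minimal sketch of how the two entry points relate, based only on the signatures in this hunk (`encoded_batch_with_size`, its return type, and the size parameter are this PR's additions, so their exact shape here is an assumption, as is the 1024-byte limit):

   ```rust
   use std::sync::Arc;

   use arrow_array::{Int32Array, RecordBatch};
   use arrow_ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions};
   use arrow_schema::{ArrowError, DataType, Field, Schema};

   fn main() -> Result<(), ArrowError> {
       let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
       let batch =
           RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1, 2, 3]))])?;

       let generator = IpcDataGenerator::default();
       let mut tracker = DictionaryTracker::new(false);
       let options = IpcWriteOptions::default();

       // No size limit: the batch is always encoded as exactly one EncodedData
       let (_dictionaries, _message) =
           generator.encoded_batch(&batch, &mut tracker, &options)?;

       // With a size limit (this PR's addition): the batch may be split into
       // several encoded messages, each targeting at most ~1024 bytes
       let (_dictionaries, _messages) =
           generator.encoded_batch_with_size(&batch, &mut tracker, &options, 1024)?;
       Ok(())
   }
   ```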



##########
arrow-ipc/src/writer.rs:
##########
@@ -1414,44 +1314,73 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize {
     }
 }
 
-/// Common functionality for re-encoding offsets. Returns the new offsets as well as
-/// original start offset and length for use in slicing child data.
-fn reencode_offsets<O: OffsetSizeTrait>(
-    offsets: &Buffer,
-    data: &ArrayData,
-) -> (Buffer, usize, usize) {
-    let offsets_slice: &[O] = offsets.typed_data::<O>();
-    let offset_slice = &offsets_slice[data.offset()..data.offset() + data.len() + 1];
-
-    let start_offset = offset_slice.first().unwrap();
-    let end_offset = offset_slice.last().unwrap();
-
-    let offsets = match start_offset.as_usize() {
-        0 => offsets.clone(),
-        _ => offset_slice.iter().map(|x| *x - *start_offset).collect(),
-    };
-
-    let start_offset = start_offset.as_usize();
-    let end_offset = end_offset.as_usize();
-
-    (offsets, start_offset, end_offset - start_offset)
-}
-
-/// Returns the values and offsets [`Buffer`] for a ByteArray with offset type `O`
+/// Returns the offsets and values [`Buffer`]s for a ByteArray with offset type `O`
 ///
 /// In particular, this handles re-encoding the offsets if they don't start at `0`,
 /// slicing the values buffer as appropriate. This helps reduce the encoded
 /// size of sliced arrays, as values that have been sliced away are not encoded
-fn get_byte_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, Buffer) {
+///
+/// # Panics
+///
+/// Panics if `data.buffers()` does not contain at least 2 buffers (this code expects that the
+/// first will contain the offsets for this variable-length array and the other will contain
+/// the values)
+pub fn get_byte_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, Buffer) {
     if data.is_empty() {
         return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into());
     }
 
-    let (offsets, original_start_offset, len) = reencode_offsets::<O>(&data.buffers()[0], data);
+    // Get the buffer of offsets, shifted so they are accurate for the slice of values we'll
+    // be taking. E.g. if they previously said [0, 3, 5, 7] but we slice to keep only the last
+    // value, they'll be shifted to [0, 2], since that is the offset pair for the last value in
+    // the sliced values buffer. In this example, original_start_offset would be 5 and len
+    // would be 2.
+    let (offsets, original_start_offset, len) = reencode_offsets::<O>(data);
     let values = data.buffers()[1].slice_with_length(original_start_offset, len);
     (offsets, values)
 }
 
+/// Common functionality for re-encoding offsets. Returns the new offsets as well as
+/// original start offset and length for use in slicing child data.
+///
+/// # Panics
+///
+/// Panics if called on an `ArrayData` that does not have a buffer of offsets as its
+/// very first buffer (i.e. this expects a valid variable-length array)
+pub fn reencode_offsets<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, usize, usize) {

Review Comment:
   does this need to be pub?
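
   To make the worked example in the new comment concrete, here is a small sketch (it assumes `reencode_offsets` stays `pub`, which is exactly what this comment questions; if it becomes private, the same behavior is still exercised internally via `get_byte_array_buffers`):

   ```rust
   use arrow_array::{Array, StringArray};
   use arrow_ipc::writer::reencode_offsets; // `pub` only as of this PR

   fn main() {
       // Offsets [0, 3, 5, 7] over the values buffer "foobars"
       let array = StringArray::from(vec!["foo", "ba", "rs"]);
       // Keep only the last value, "rs"; its offset pair in the parent is (5, 7)
       let sliced = array.slice(2, 1);
       let data = sliced.to_data();

       // Offsets are shifted to start at 0; start/len locate "rs" in the
       // original values buffer
       let (offsets, start, len) = reencode_offsets::<i32>(&data);
       assert_eq!(offsets.typed_data::<i32>(), &[0, 2]);
       assert_eq!((start, len), (5, 2));
   }
   ```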


