alamb commented on a change in pull request #810:
URL: https://github.com/apache/arrow-rs/pull/810#discussion_r719790489



##########
File path: arrow/src/array/data.rs
##########
@@ -188,6 +188,140 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: 
usize) -> [MutableBuff
     }
 }
 
+enum SizeNeed {
+    /// type needs a single buffer that has at least this many bytes
+    SingleBuffer(usize),
+    Compound,
+}
+
+/// Validates that buffers are sufficiently sized, to store `n` elements of 
the specified type
+///
+/// Note for types that require multiple buffers, this only returns the size 
for buffer 0
+#[inline]
+fn min_size_needed(data_type: &DataType, n: usize) -> SizeNeed {
+    use SizeNeed::*;
+    match data_type {
+        DataType::Null => SingleBuffer(0),
+        DataType::Boolean => SingleBuffer(bit_util::ceil(n, 8)),
+        DataType::UInt8 => SingleBuffer(n * mem::size_of::<u8>()),
+        DataType::UInt16 => SingleBuffer(n * mem::size_of::<u16>()),
+        DataType::UInt32 => SingleBuffer(n * mem::size_of::<u32>()),
+        DataType::UInt64 => SingleBuffer(n * mem::size_of::<u64>()),
+        DataType::Int8 => SingleBuffer(n * mem::size_of::<i8>()),
+        DataType::Int16 => SingleBuffer(n * mem::size_of::<i16>()),
+        DataType::Int32 => SingleBuffer(n * mem::size_of::<i32>()),
+        DataType::Int64 => SingleBuffer(n * mem::size_of::<i64>()),
+        DataType::Float32 => SingleBuffer(n * mem::size_of::<f32>()),
+        DataType::Float64 => SingleBuffer(n * mem::size_of::<f64>()),
+        DataType::Date32 | DataType::Time32(_) => SingleBuffer(n * 
mem::size_of::<i32>()),
+        DataType::Date64
+        | DataType::Time64(_)
+        | DataType::Duration(_)
+        | DataType::Timestamp(_, _) => SingleBuffer(n * mem::size_of::<i64>()),
+        DataType::Interval(IntervalUnit::YearMonth) => {
+            SingleBuffer(n * mem::size_of::<i32>())
+        }
+        DataType::Interval(IntervalUnit::DayTime) => {
+            SingleBuffer(n * mem::size_of::<i64>())
+        }
+        DataType::Float16 => unimplemented!(),
+        DataType::Binary => Compound,
+        DataType::FixedSizeBinary(_) => Compound,
+        DataType::LargeBinary => Compound,
+        DataType::Utf8 => Compound,
+        DataType::LargeUtf8 => Compound,
+        DataType::List(_) => Compound,
+        DataType::FixedSizeList(_, _) => Compound,
+        DataType::LargeList(_) => Compound,
+        DataType::Struct(_) => Compound,
+        DataType::Union(_) => Compound,
+        DataType::Dictionary(_, _) => Compound,
+        DataType::Decimal(_, _) => Compound,
+        DataType::Map(_, _) => Compound,
+    }
+
+    // DataType::Utf8 | DataType::Binary => {
+    //         // 
https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout
+    //         // these have offsets + data buffer
+    //         // return size for offsets buffer
+    //         (1 + n) * mem::size_of::<i32>()
+    //     }
+    //     DataType::LargeUtf8 | DataType::LargeBinary => {
+    //         // 
https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout
+    //         // these have offsets + data buffer
+    //         // return size for offsets buffer
+    //         (1 + n) * mem::size_of::<i64>()
+    //     }
+    //     DataType::List(_) | DataType::Map(_, _) => {
+    //         // 
https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout
+    //         // these have offsets + data buffer
+    //         // return size for offsets buffer
+    //         (1 + n) * mem::size_of::<i32>()
+    //     }
+    //     DataType::LargeList(_) => {
+    //         // 
https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout
+    //         // these have offsets + data buffer
+    //         // return size for offsets buffer
+    //         (1 + n) * mem::size_of::<i64>()
+    //     }
+    //     DataType::FixedSizeBinary(size) => {
+    //         n * *size as usize
+    //     }
+    //     DataType::Dictionary(key_type, _value_type) => {
+    //         // dictionary's first buffer is the keys array
+    //         buffer0_min_size(key_type.as_ref(), n)
+    //     },
+    //     DataType::Float16 => unreachable!(),
+    //     DataType::FixedSizeList(field, field_length) => {
+    //         // NOTE THIS IS DIFFERENT THAN USED IN new-buffers above
+    //         
//https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout
+    //         *field_length as usize * buffer0_min_size(field.data_type(), n)
+    //     },
+
+    //     DataType::Struct(fields) => {
+    //         
//https://arrow.apache.org/docs/format/Columnar.html#struct-layout
+    //         // first buffer is the first field size
+    //         //
+    //         // the usefulnes of this as validation is pretty
+    //         // low... TBD come up with a better API (maybe return an
+    //         // enum??)
+    //         fields.get(0)
+    //             .map(|first_field| {
+    //                 buffer0_min_size(first_field.data_type(), n)
+    //             })
+    //             .unwrap_or(0)
+    //     }
+    //     DataType::Decimal(_, _) => {
+    //         // 
https://arrow.apache.org/docs/format/Columnar.html#terminology
+    //         // "For example, Decimal values are stored as 16 bytes in a 
fixed-size binary layout. Similarly,"
+    //         n * 16 * mem::size_of::<u8>()
+    //     }
+    //     DataType::Union(_) => unimplemented!(),
+}
+
+/// Validates that buffers are sufficiently sized, to store `n`
+/// elements of `data_type`, panic'ing if this is not the case
+fn validate_buffer_sizes(data_type: &DataType, n: usize, buffers: &[Buffer]) {
+    match min_size_needed(data_type, n) {
+        SizeNeed::SingleBuffer(min_size) => {
+            println!(
+                "Validating data of type {:?}, n={}, buffers={}",
+                data_type,
+                n,
+                buffers.len()
+            );
+            // if min_size is zero, may not have buffers (e.g. NullArray)
+            assert!(min_size == 0 || buffers[0].len() >= min_size,

Review comment:
       This is the core validation check. 

##########
File path: arrow/src/array/array_primitive.rs
##########
@@ -869,7 +869,7 @@ mod tests {
     #[test]
     fn test_primitive_array_builder() {
         // Test building a primitive array with ArrayData builder and offset
-        let buf = Buffer::from_slice_ref(&[0, 1, 2, 3, 4]);
+        let buf = Buffer::from_slice_ref(&[0i32, 1, 2, 3, 4, 5, 6]);

Review comment:
       These tests need to be updated because they were creating invalid 
Buffers (this one for example had an len of 5 and offset of 2 but only passed 
in an array 5 long)




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to