alamb commented on code in PR #7120:
URL: https://github.com/apache/arrow-rs/pull/7120#discussion_r1952498517


##########
arrow-data/src/data.rs:
##########
@@ -1781,33 +1780,61 @@ impl PartialEq for ArrayData {
     }
 }
 
-mod private {
-    /// A boolean flag that cannot be mutated outside of unsafe code.
+/// A boolean flag that cannot be mutated outside of unsafe code.

Review Comment:
   I propose to make this `UnsafeFlag` public  (and added examples and more 
docs) so I could use it across the two crates. However, I can also make a 
private copy of it in arrow-ipc if reviewers feel it would be better to avoid a 
new API



##########
arrow-ipc/src/reader.rs:
##########
@@ -136,40 +136,20 @@ impl RecordBatchDecoder<'_> {
                     let child = self.create_array(struct_field, 
variadic_counts)?;
                     struct_arrays.push(child);
                 }
-                let null_count = struct_node.null_count() as usize;
-                let struct_array = if struct_arrays.is_empty() {
-                    // `StructArray::from` can't infer the correct row count
-                    // if we have zero fields
-                    let len = struct_node.length() as usize;
-                    StructArray::new_empty_fields(
-                        len,
-                        (null_count > 0).then(|| 
BooleanBuffer::new(null_buffer, 0, len).into()),
-                    )
-                } else if null_count > 0 {
-                    // create struct array from fields, arrays and null data
-                    let len = struct_node.length() as usize;
-                    let nulls = BooleanBuffer::new(null_buffer, 0, len).into();
-                    StructArray::try_new(struct_fields.clone(), struct_arrays, 
Some(nulls))?
-                } else {
-                    StructArray::try_new(struct_fields.clone(), struct_arrays, 
None)?
-                };
-                Ok(Arc::new(struct_array))
+                self.create_struct_array(struct_node, null_buffer, 
struct_fields, struct_arrays)

Review Comment:
   I refactored this code into its own function so it was eaiser to call 
`StructArray::new_unchecked` when validation was disabled



##########
arrow-ipc/benches/ipc_reader.rs:
##########
@@ -24,25 +24,34 @@ use arrow_ipc::writer::{FileWriter, IpcWriteOptions, 
StreamWriter};
 use arrow_ipc::{root_as_footer, Block, CompressionType};
 use arrow_schema::{DataType, Field, Schema};
 use criterion::{criterion_group, criterion_main, Criterion};
-use std::io::Cursor;
+use std::io::{Cursor, Write};
 use std::sync::Arc;
 use tempfile::tempdir;
 
 fn criterion_benchmark(c: &mut Criterion) {
     let mut group = c.benchmark_group("arrow_ipc_reader");
 
     group.bench_function("StreamReader/read_10", |b| {
-        let batch = create_batch(8192, true);
-        let mut buffer = Vec::with_capacity(2 * 1024 * 1024);
-        let mut writer = StreamWriter::try_new(&mut buffer, 
batch.schema().as_ref()).unwrap();
-        for _ in 0..10 {
-            writer.write(&batch).unwrap();
-        }
-        writer.finish().unwrap();
+        let buffer = ipc_stream();

Review Comment:
   I added new versions of each benchmark that work with disabled validation



##########
arrow-ipc/src/reader.rs:
##########
@@ -1462,6 +1546,16 @@ impl<R: Read> StreamReader<R> {
     pub fn get_mut(&mut self) -> &mut R {
         &mut self.reader
     }
+
+    /// Specifies if validation should be skipped when reading data (defaults 
to `false`)

Review Comment:
   new API



##########
arrow-ipc/src/reader.rs:
##########
@@ -2602,18 +2807,18 @@ mod tests {
 
         // IPC Stream format
         let buf = write_stream(&rb); // write is ok
+        read_stream_skip_validation(&buf).unwrap();

Review Comment:
   Now all the validation tests *also* verify they can read the batch back 
without error if validation is disabled



##########
arrow-ipc/src/reader.rs:
##########
@@ -432,6 +458,17 @@ impl<'a> RecordBatchDecoder<'a> {
         self
     }
 
+    /// Specifies if validation should be skipped when reading data (defaults 
to `false`)
+    ///
+    /// # Safety
+    ///
+    /// Relies on the caller only passing a flag with `true` value if they are
+    /// certain that the data is valid
+    pub fn with_skip_validation(mut self, skip_validation: UnsafeFlag) -> Self 
{

Review Comment:
   Note that `RecordBatchDecoder` is not a public API:
    
https://docs.rs/arrow-ipc/54.1.0/arrow_ipc/reader/struct.StreamReader.html?search=RecordBatchDecoder



##########
arrow-ipc/src/reader.rs:
##########
@@ -2456,6 +2584,57 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_invalid_nested_array_ipc_read_errors() {

Review Comment:
   I added some additional coverage to make sure the flag got passed when 
decoding multiple nesting levels



##########
arrow-ipc/src/reader.rs:
##########
@@ -1177,6 +1243,16 @@ impl<R: Read + Seek> FileReader<R> {
     pub fn get_mut(&mut self) -> &mut R {
         &mut self.reader
     }
+
+    /// Specifies if validation should be skipped when reading data (defaults 
to `false`)

Review Comment:
   new API



##########
arrow-ipc/src/reader.rs:
##########
@@ -2592,6 +2771,32 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_validation_of_invalid_union_array() {

Review Comment:
   Turns out UnionArray had its own path / handling so new coverage added



##########
arrow-ipc/src/reader.rs:
##########
@@ -809,6 +858,21 @@ impl FileDecoder {
         self
     }
 
+    /// Specifies if validation should be skipped when reading data (defaults 
to `false`)

Review Comment:
   this is a new public API and follows the same pattern as 
`ArrayData::skip_validation`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to