alamb commented on code in PR #7096:
URL: https://github.com/apache/arrow-rs/pull/7096#discussion_r1946933691


##########
arrow-ipc/src/reader.rs:
##########
@@ -1744,27 +1745,73 @@ mod tests {
         });
     }
 
-    fn roundtrip_ipc(rb: &RecordBatch) -> RecordBatch {
+    /// Write the record batch to an in-memory buffer in IPC File format

Review Comment:
   I just refactored these helpers into smaller chunks



##########
arrow-ipc/src/reader.rs:
##########
@@ -2492,4 +2539,109 @@ mod tests {
                 assert_eq!(decoded_batch.expect("Failed to read RecordBatch"), 
batch);
             });
     }
+
+    #[test]
+    fn test_validation_of_invalid_list_array() {

Review Comment:
   Any thoughts on how I can make an invalid `PrimitiveArray` would be most 
apprecaited.
   
   Anything i tried to mismatch the len and the actual data, resulted in panics 
in bounds checks (even with the `try_new_unchecked`). 
   
   I think it is probably a good thing that it is so hard to create invalid 
arrays but it would be nice to test this path



##########
arrow-ipc/src/reader.rs:
##########
@@ -1744,27 +1745,73 @@ mod tests {
         });
     }
 
-    fn roundtrip_ipc(rb: &RecordBatch) -> RecordBatch {
+    /// Write the record batch to an in-memory buffer in IPC File format
+    fn write_ipc(rb: &RecordBatch) -> Vec<u8> {
         let mut buf = Vec::new();
         let mut writer = crate::writer::FileWriter::try_new(&mut buf, 
rb.schema_ref()).unwrap();
         writer.write(rb).unwrap();
         writer.finish().unwrap();
-        drop(writer);
+        buf
+    }
 
-        let mut reader = FileReader::try_new(std::io::Cursor::new(buf), 
None).unwrap();
-        reader.next().unwrap().unwrap()
+    /// Return the first record batch read from the IPC File buffer
+    fn read_ipc(buf: &[u8]) -> Result<RecordBatch, ArrowError> {
+        let mut reader = FileReader::try_new(std::io::Cursor::new(buf), None)?;
+        reader.next().unwrap()
     }
 
-    fn roundtrip_ipc_stream(rb: &RecordBatch) -> RecordBatch {
+    fn roundtrip_ipc(rb: &RecordBatch) -> RecordBatch {
+        let buf = write_ipc(rb);
+        read_ipc(&buf).unwrap()
+    }
+
+    /// Return the first record batch read from the IPC File buffer
+    /// using the FileDecoder API
+    fn read_ipc_with_decoder(buf: Vec<u8>) -> Result<RecordBatch, ArrowError> {

Review Comment:
   Every time I find myself using `FileDecoder` I end up copy/pasting the 
example. 
   
   I am starting to think having something like 
https://github.com/apache/arrow-rs/blob/468e992a81a5b8bf493b6073df66745f3dd6a4f0/arrow/examples/zero_copy_ipc.rs#L84-L96
 in the actual crate might be useful
   
   (given I am about to go copy/paste it again into comet...)



##########
arrow-ipc/src/reader.rs:
##########
@@ -2492,4 +2539,109 @@ mod tests {
                 assert_eq!(decoded_batch.expect("Failed to read RecordBatch"), 
batch);
             });
     }
+
+    #[test]
+    fn test_validation_of_invalid_list_array() {
+        // ListArray with invalid offsets
+        let array = unsafe {
+            let values = Int32Array::from(vec![1, 2, 3]);
+            let bad_offsets = ScalarBuffer::<i32>::from(vec![0, 2, 4, 2]); // 
offsets can't go backwards
+            let offsets = OffsetBuffer::new_unchecked(bad_offsets); // INVALID 
array created
+            let field = Field::new_list_field(DataType::Int32, true);
+            let nulls = None;
+            ListArray::new(Arc::new(field), offsets, Arc::new(values), nulls)
+        };
+
+        expect_ipc_validation_error(
+            Arc::new(array),
+            "Invalid argument error: Offset invariant failure: offset at 
position 2 out of bounds: 4 > 2"
+        );
+    }
+
+    #[test]
+    fn test_validation_of_invalid_string_array() {
+        let valid: &[u8] = b"   ";
+        let mut invalid = vec![];
+        invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
+        invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR);
+        let binary_array = BinaryArray::from_iter(vec![None, Some(valid), 
None, Some(&invalid)]);
+        // data is not valid utf8 we can not construct a correct StringArray
+        // safely, so purposely create an invalid StringArray
+        let array = unsafe {
+            StringArray::new_unchecked(
+                binary_array.offsets().clone(),
+                binary_array.values().clone(),
+                binary_array.nulls().cloned(),
+            )
+        };
+        expect_ipc_validation_error(
+            Arc::new(array),
+            "Invalid argument error: Invalid UTF8 sequence at string index 3 
(3..45): invalid utf-8 sequence of 1 bytes from index 38"
+        );
+    }
+
+    #[test]
+    fn test_validation_of_invalid_string_view_array() {
+        let valid: &[u8] = b"   ";
+        let mut invalid = vec![];
+        invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
+        invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR);
+        let binary_view_array =
+            BinaryViewArray::from_iter(vec![None, Some(valid), None, 
Some(&invalid)]);
+        // data is not valid utf8 we can not construct a correct StringArray
+        // safely, so purposely create an invalid StringArray
+        let array = unsafe {
+            StringViewArray::new_unchecked(
+                binary_view_array.views().clone(),
+                binary_view_array.data_buffers().to_vec(),
+                binary_view_array.nulls().cloned(),
+            )
+        };
+        expect_ipc_validation_error(
+            Arc::new(array),
+            "Invalid argument error: Encountered non-UTF-8 data at index 3: 
invalid utf-8 sequence of 1 bytes from index 38"
+        );
+    }
+
+    /// return an invalid dictionary array (key is larger than values)
+    /// ListArray with invalid offsets
+    #[test]
+    fn test_validation_of_invalid_dictionary_array() {
+        let array = unsafe {
+            let values = StringArray::from_iter_values(["a", "b", "c"]);
+            let keys = Int32Array::from(vec![1, 200]); // keys are not valid 
for values
+            DictionaryArray::new_unchecked(keys, Arc::new(values))
+        };
+
+        expect_ipc_validation_error(
+            Arc::new(array),
+            "Invalid argument error: Value at position 1 out of bounds: 200 
(should be in [0, 2])",
+        );
+    }
+
+    /// Invalid Utf-8 sequence in the first character
+    /// 
<https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
+    const INVALID_UTF8_FIRST_CHAR: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];
+
+    /// Expect an error when reading the record batch using IPC or IPC Streams
+    fn expect_ipc_validation_error(array: ArrayRef, expected_err: &str) {
+        let rb = RecordBatch::try_from_iter([("a", array)]).unwrap();
+
+        // IPC Stream format
+        let buf = write_stream(&rb); // write is ok
+        let err = read_stream(&buf).unwrap_err();
+        assert_eq!(err.to_string(), expected_err);
+
+        // IPC File format
+        let buf = write_ipc(&rb); // write is ok
+        let err = read_ipc(&buf).unwrap_err();
+        assert_eq!(err.to_string(), expected_err);
+
+        // TODO verify there is no error when validation is disabled

Review Comment:
   I plan to add a few more lines here when we disable validation



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to