This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 883c13ae217 Add more invalid utf8 parquet reader tests (#5639)
883c13ae217 is described below

commit 883c13ae2177f9b029452df5cc9d9c119fa989d3
Author: Andrew Lamb <and...@nerdnetworks.org>
AuthorDate: Mon Apr 15 07:57:55 2024 -0400

    Add more invalid utf8 parquet reader tests (#5639)
    
    * Add more invalid utf8 reader tests
    
    * Improve comments
---
 parquet/src/arrow/arrow_reader/mod.rs | 126 +++++++++++++++++++++++++++++++++-
 1 file changed, 124 insertions(+), 2 deletions(-)

diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 3b4c0931437..78d0fd6da8a 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -752,11 +752,11 @@ mod tests {
     use arrow_array::*;
     use arrow_buffer::{i256, ArrowNativeType, Buffer};
     use arrow_data::ArrayDataBuilder;
-    use arrow_schema::{DataType as ArrowDataType, Field, Fields, Schema};
+    use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, Fields, Schema};
     use arrow_select::concat::concat_batches;
 
     use crate::arrow::arrow_reader::{
-        ArrowPredicateFn, ArrowReaderOptions, ParquetRecordBatchReader,
+        ArrowPredicateFn, ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReader,
         ParquetRecordBatchReaderBuilder, RowFilter, RowSelection, RowSelector,
     };
     use crate::arrow::schema::add_encoded_arrow_schema_to_metadata;
@@ -2169,6 +2169,128 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_invalid_utf8_string_array() {
+        test_invalid_utf8_string_array_inner::<i32>();
+    }
+    #[test]
+    fn test_invalid_utf8_large_string_array() {
+        test_invalid_utf8_string_array_inner::<i64>();
+    }
+    fn test_invalid_utf8_string_array_inner<O: OffsetSizeTrait>() {
+        let cases = [
+            (
+                invalid_utf8_first_char::<O>(),
+                "Parquet argument error: Parquet error: encountered non UTF-8 data",
+            ),
+            (
+                invalid_utf8_later_char::<O>(),
+                "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 6",
+            ),
+        ];
+        for (array, expected_error) in cases {
+            // data is not valid utf8 we can not construct a correct StringArray
+            // safely, so purposely create an invalid StringArray
+            let array = unsafe {
+                GenericStringArray::<O>::new_unchecked(
+                    array.offsets().clone(),
+                    array.values().clone(),
+                    array.nulls().cloned(),
+                )
+            };
+            let data_type = array.data_type().clone();
+            let data = write_to_parquet(Arc::new(array));
+            let err = read_from_parquet(data).unwrap_err();
+            assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}")
+        }
+    }
+
+    #[test]
+    fn test_invalid_utf8_string_view_array() {
+        let cases = [
+            (
+                invalid_utf8_first_char::<i32>(),
+                "Parquet argument error: Parquet error: encountered non UTF-8 data",
+            ),
+            (
+                invalid_utf8_later_char::<i32>(),
+                "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 6",
+            ),
+        ];
+        for (array, expected_error) in cases {
+            // cast not yet implemented for BinaryView
+            // https://github.com/apache/arrow-rs/issues/5508
+            // so copy directly
+            let mut builder = BinaryViewBuilder::with_capacity(100);
+            for v in array.iter() {
+                if let Some(v) = v {
+                    builder.append_value(v);
+                } else {
+                    builder.append_null();
+                }
+            }
+            let array = builder.finish();
+
+            // data is not valid utf8 we can not construct a correct StringArray
+            // safely, so purposely create an invalid StringArray
+            let array = unsafe {
+                StringViewArray::new_unchecked(
+                    array.views().clone(),
+                    array.data_buffers().to_vec(),
+                    array.nulls().cloned(),
+                )
+            };
+            let data_type = array.data_type().clone();
+            let data = write_to_parquet(Arc::new(array));
+            let err = read_from_parquet(data).unwrap_err();
+            assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}")
+        }
+    }
+
+    /// returns a BinaryArray with invalid UTF8 data in the first character
+    fn invalid_utf8_first_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
+        // invalid sequence in the first character
+        // https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
+        let valid: &[u8] = b"   ";
+        let invalid: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];
+        GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(invalid)])
+    }
+
+    /// returns a BinaryArray with invalid UTF8 data in a character other than
+    /// the first (this is checked in a special codepath)
+    fn invalid_utf8_later_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
+        // invalid sequence in NOT the first character
+        // https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
+        let valid: &[u8] = b"   ";
+        let invalid: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20];
+        GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(invalid)])
+    }
+
+    // writes the array into a single column parquet file
+    fn write_to_parquet(array: ArrayRef) -> Vec<u8> {
+        let batch = RecordBatch::try_from_iter(vec![("c", array)]).unwrap();
+        let mut data = vec![];
+        let schema = batch.schema();
+        let props = None;
+        {
+            let mut writer = ArrowWriter::try_new(&mut data, schema, props).unwrap();
+            writer.write(&batch).unwrap();
+            writer.flush().unwrap();
+            writer.close().unwrap();
+        };
+        data
+    }
+
+    /// read the parquet file into a record batch
+    fn read_from_parquet(data: Vec<u8>) -> Result<Vec<RecordBatch>, ArrowError> {
+        let reader = ArrowReaderBuilder::try_new(bytes::Bytes::from(data))
+            .unwrap()
+            .build()
+            .unwrap();
+
+        reader.collect()
+    }
+
     #[test]
     fn test_dictionary_preservation() {
         let fields = vec![Arc::new(

Reply via email to