This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 883c13ae217 Add more invalid utf8 parquet reader tests (#5639)
883c13ae217 is described below
commit 883c13ae2177f9b029452df5cc9d9c119fa989d3
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Apr 15 07:57:55 2024 -0400
Add more invalid utf8 parquet reader tests (#5639)
* Add more invalid utf8 reader tests
* Improve comments
---
parquet/src/arrow/arrow_reader/mod.rs | 126 +++++++++++++++++++++++++++++++++-
1 file changed, 124 insertions(+), 2 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index 3b4c0931437..78d0fd6da8a 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -752,11 +752,11 @@ mod tests {
use arrow_array::*;
use arrow_buffer::{i256, ArrowNativeType, Buffer};
use arrow_data::ArrayDataBuilder;
- use arrow_schema::{DataType as ArrowDataType, Field, Fields, Schema};
+ use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, Fields,
Schema};
use arrow_select::concat::concat_batches;
use crate::arrow::arrow_reader::{
- ArrowPredicateFn, ArrowReaderOptions, ParquetRecordBatchReader,
+ ArrowPredicateFn, ArrowReaderBuilder, ArrowReaderOptions,
ParquetRecordBatchReader,
ParquetRecordBatchReaderBuilder, RowFilter, RowSelection, RowSelector,
};
use crate::arrow::schema::add_encoded_arrow_schema_to_metadata;
@@ -2169,6 +2169,128 @@ mod tests {
);
}
+ #[test]
+ fn test_invalid_utf8_string_array() {
+ test_invalid_utf8_string_array_inner::<i32>();
+ }
+ #[test]
+ fn test_invalid_utf8_large_string_array() {
+ test_invalid_utf8_string_array_inner::<i64>();
+ }
+ fn test_invalid_utf8_string_array_inner<O: OffsetSizeTrait>() {
+ let cases = [
+ (
+ invalid_utf8_first_char::<O>(),
+ "Parquet argument error: Parquet error: encountered non UTF-8
data",
+ ),
+ (
+ invalid_utf8_later_char::<O>(),
+ "Parquet argument error: Parquet error: encountered non UTF-8
data: invalid utf-8 sequence of 1 bytes from index 6",
+ ),
+ ];
+ for (array, expected_error) in cases {
+ // the data is not valid utf8, so we cannot construct a correct StringArray
+ // safely; purposely create an invalid StringArray instead
+ let array = unsafe {
+ GenericStringArray::<O>::new_unchecked(
+ array.offsets().clone(),
+ array.values().clone(),
+ array.nulls().cloned(),
+ )
+ };
+ let data_type = array.data_type().clone();
+ let data = write_to_parquet(Arc::new(array));
+ let err = read_from_parquet(data).unwrap_err();
+ assert_eq!(err.to_string(), expected_error, "data type:
{data_type:?}")
+ }
+ }
+
+ #[test]
+ fn test_invalid_utf8_string_view_array() {
+ let cases = [
+ (
+ invalid_utf8_first_char::<i32>(),
+ "Parquet argument error: Parquet error: encountered non UTF-8
data",
+ ),
+ (
+ invalid_utf8_later_char::<i32>(),
+ "Parquet argument error: Parquet error: encountered non UTF-8
data: invalid utf-8 sequence of 1 bytes from index 6",
+ ),
+ ];
+ for (array, expected_error) in cases {
+ // cast not yet implemented for BinaryView
+ // https://github.com/apache/arrow-rs/issues/5508
+ // so copy directly
+ let mut builder = BinaryViewBuilder::with_capacity(100);
+ for v in array.iter() {
+ if let Some(v) = v {
+ builder.append_value(v);
+ } else {
+ builder.append_null();
+ }
+ }
+ let array = builder.finish();
+
+ // the data is not valid utf8, so we cannot construct a correct StringViewArray
+ // safely; purposely create an invalid StringViewArray instead
+ let array = unsafe {
+ StringViewArray::new_unchecked(
+ array.views().clone(),
+ array.data_buffers().to_vec(),
+ array.nulls().cloned(),
+ )
+ };
+ let data_type = array.data_type().clone();
+ let data = write_to_parquet(Arc::new(array));
+ let err = read_from_parquet(data).unwrap_err();
+ assert_eq!(err.to_string(), expected_error, "data type:
{data_type:?}")
+ }
+ }
+
+ /// returns a BinaryArray with invalid UTF8 data in the first character
+ fn invalid_utf8_first_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
+ // invalid sequence in the first character
+ //
https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
+ let valid: &[u8] = b" ";
+ let invalid: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];
+ GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(invalid)])
+ }
+
+ /// returns a BinaryArray with invalid UTF8 data in a character other than
+ /// the first (this is checked in a special codepath)
+ fn invalid_utf8_later_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
+ // invalid sequence in NOT the first character
+ //
https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
+ let valid: &[u8] = b" ";
+ let invalid: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20];
+ GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(invalid)])
+ }
+
+ // writes the array into a single column parquet file
+ fn write_to_parquet(array: ArrayRef) -> Vec<u8> {
+ let batch = RecordBatch::try_from_iter(vec![("c", array)]).unwrap();
+ let mut data = vec![];
+ let schema = batch.schema();
+ let props = None;
+ {
+ let mut writer = ArrowWriter::try_new(&mut data, schema,
props).unwrap();
+ writer.write(&batch).unwrap();
+ writer.flush().unwrap();
+ writer.close().unwrap();
+ };
+ data
+ }
+
+ /// reads the parquet file into a list of record batches
+ fn read_from_parquet(data: Vec<u8>) -> Result<Vec<RecordBatch>,
ArrowError> {
+ let reader = ArrowReaderBuilder::try_new(bytes::Bytes::from(data))
+ .unwrap()
+ .build()
+ .unwrap();
+
+ reader.collect()
+ }
+
#[test]
fn test_dictionary_preservation() {
let fields = vec![Arc::new(