This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 722dc643dc Moving invalid_utf8 tests into a separate mod (#9384)
722dc643dc is described below
commit 722dc643dcc0b5812027dac2e236994f726883da
Author: Kosta Tarasov <[email protected]>
AuthorDate: Tue Feb 10 13:57:12 2026 -0500
Moving invalid_utf8 tests into a separate mod (#9384)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Part of #9269.
# Rationale for this change
See the linked issue #9269 for the rationale behind splitting these tests out.
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
# What changes are included in this PR?
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
Moved invalid utf8 tests from `parquet/src/arrow/arrow_reader/mod.rs` ->
`parquet/tests/arrow_reader/invalid_utf8.rs`
# Are these changes tested?
No new tests are needed: this PR only moves existing tests, which continue to run unchanged in their new location.
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->
# Are there any user-facing changes?
No user-facing changes; this is internal test-code movement only.
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
---
parquet/src/arrow/arrow_reader/mod.rs | 243 +-------------------------
parquet/tests/arrow_reader/invalid_utf8.rs | 265 +++++++++++++++++++++++++++++
parquet/tests/arrow_reader/mod.rs | 1 +
3 files changed, 269 insertions(+), 240 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index d039841800..eb032dc1a7 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1580,9 +1580,8 @@ pub(crate) mod tests {
use tempfile::tempfile;
use crate::arrow::arrow_reader::{
- ArrowPredicateFn, ArrowReaderBuilder, ArrowReaderMetadata,
ArrowReaderOptions,
- ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder, RowFilter,
RowSelection,
- RowSelector,
+ ArrowPredicateFn, ArrowReaderMetadata, ArrowReaderOptions,
ParquetRecordBatchReader,
+ ParquetRecordBatchReaderBuilder, RowFilter, RowSelection, RowSelector,
};
use crate::arrow::schema::{
add_encoded_arrow_schema_to_metadata,
@@ -1612,9 +1611,7 @@ pub(crate) mod tests {
use arrow_array::*;
use arrow_buffer::{ArrowNativeType, Buffer, IntervalDayTime, NullBuffer,
i256};
use arrow_data::{ArrayData, ArrayDataBuilder};
- use arrow_schema::{
- ArrowError, DataType as ArrowDataType, Field, Fields, Schema,
SchemaRef, TimeUnit,
- };
+ use arrow_schema::{DataType as ArrowDataType, Field, Fields, Schema,
SchemaRef, TimeUnit};
use arrow_select::concat::concat_batches;
use bytes::Bytes;
use half::f16;
@@ -3730,240 +3727,6 @@ pub(crate) mod tests {
assert_eq!(batch.column(0).null_count(), 2);
}
- #[test]
- fn test_invalid_utf8() {
- // a parquet file with 1 column with invalid utf8
- let data = vec![
- 80, 65, 82, 49, 21, 6, 21, 22, 21, 22, 92, 21, 2, 21, 0, 21, 2,
21, 0, 21, 4, 21, 0,
- 18, 28, 54, 0, 40, 5, 104, 101, 255, 108, 111, 24, 5, 104, 101,
255, 108, 111, 0, 0, 0,
- 3, 1, 5, 0, 0, 0, 104, 101, 255, 108, 111, 38, 110, 28, 21, 12,
25, 37, 6, 0, 25, 24,
- 2, 99, 49, 21, 0, 22, 2, 22, 102, 22, 102, 38, 8, 60, 54, 0, 40,
5, 104, 101, 255, 108,
- 111, 24, 5, 104, 101, 255, 108, 111, 0, 0, 0, 21, 4, 25, 44, 72,
4, 114, 111, 111, 116,
- 21, 2, 0, 21, 12, 37, 2, 24, 2, 99, 49, 37, 0, 76, 28, 0, 0, 0,
22, 2, 25, 28, 25, 28,
- 38, 110, 28, 21, 12, 25, 37, 6, 0, 25, 24, 2, 99, 49, 21, 0, 22,
2, 22, 102, 22, 102,
- 38, 8, 60, 54, 0, 40, 5, 104, 101, 255, 108, 111, 24, 5, 104, 101,
255, 108, 111, 0, 0,
- 0, 22, 102, 22, 2, 0, 40, 44, 65, 114, 114, 111, 119, 50, 32, 45,
32, 78, 97, 116, 105,
- 118, 101, 32, 82, 117, 115, 116, 32, 105, 109, 112, 108, 101, 109,
101, 110, 116, 97,
- 116, 105, 111, 110, 32, 111, 102, 32, 65, 114, 114, 111, 119, 0,
130, 0, 0, 0, 80, 65,
- 82, 49,
- ];
-
- let file = Bytes::from(data);
- let mut record_batch_reader = ParquetRecordBatchReader::try_new(file,
10).unwrap();
-
- let error = record_batch_reader.next().unwrap().unwrap_err();
-
- assert!(
- error.to_string().contains("invalid utf-8 sequence"),
- "{}",
- error
- );
- }
-
- #[test]
- fn test_invalid_utf8_string_array() {
- test_invalid_utf8_string_array_inner::<i32>();
- }
-
- #[test]
- fn test_invalid_utf8_large_string_array() {
- test_invalid_utf8_string_array_inner::<i64>();
- }
-
- fn test_invalid_utf8_string_array_inner<O: OffsetSizeTrait>() {
- let cases = [
- invalid_utf8_first_char::<O>(),
- invalid_utf8_first_char_long_strings::<O>(),
- invalid_utf8_later_char::<O>(),
- invalid_utf8_later_char_long_strings::<O>(),
- invalid_utf8_later_char_really_long_strings::<O>(),
- invalid_utf8_later_char_really_long_strings2::<O>(),
- ];
- for array in &cases {
- for encoding in STRING_ENCODINGS {
- // data is not valid utf8 we can not construct a correct
StringArray
- // safely, so purposely create an invalid StringArray
- let array = unsafe {
- GenericStringArray::<O>::new_unchecked(
- array.offsets().clone(),
- array.values().clone(),
- array.nulls().cloned(),
- )
- };
- let data_type = array.data_type().clone();
- let data = write_to_parquet_with_encoding(Arc::new(array),
*encoding);
- let err = read_from_parquet(data).unwrap_err();
- let expected_err =
- "Parquet argument error: Parquet error: encountered non
UTF-8 data";
- assert!(
- err.to_string().contains(expected_err),
- "data type: {data_type}, expected: {expected_err}, got:
{err}"
- );
- }
- }
- }
-
- #[test]
- fn test_invalid_utf8_string_view_array() {
- let cases = [
- invalid_utf8_first_char::<i32>(),
- invalid_utf8_first_char_long_strings::<i32>(),
- invalid_utf8_later_char::<i32>(),
- invalid_utf8_later_char_long_strings::<i32>(),
- invalid_utf8_later_char_really_long_strings::<i32>(),
- invalid_utf8_later_char_really_long_strings2::<i32>(),
- ];
-
- for encoding in STRING_ENCODINGS {
- for array in &cases {
- let array = arrow_cast::cast(&array,
&ArrowDataType::BinaryView).unwrap();
- let array = array.as_binary_view();
-
- // data is not valid utf8 we can not construct a correct
StringArray
- // safely, so purposely create an invalid StringViewArray
- let array = unsafe {
- StringViewArray::new_unchecked(
- array.views().clone(),
- array.data_buffers().to_vec(),
- array.nulls().cloned(),
- )
- };
-
- let data_type = array.data_type().clone();
- let data = write_to_parquet_with_encoding(Arc::new(array),
*encoding);
- let err = read_from_parquet(data).unwrap_err();
- let expected_err =
- "Parquet argument error: Parquet error: encountered non
UTF-8 data";
- assert!(
- err.to_string().contains(expected_err),
- "data type: {data_type}, expected: {expected_err}, got:
{err}"
- );
- }
- }
- }
-
- /// Encodings suitable for string data
- const STRING_ENCODINGS: &[Option<Encoding>] = &[
- None,
- Some(Encoding::PLAIN),
- Some(Encoding::DELTA_LENGTH_BYTE_ARRAY),
- Some(Encoding::DELTA_BYTE_ARRAY),
- ];
-
- /// Invalid Utf-8 sequence in the first character
- ///
<https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
- const INVALID_UTF8_FIRST_CHAR: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];
-
- /// Invalid Utf=8 sequence in NOT the first character
- ///
<https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
- const INVALID_UTF8_LATER_CHAR: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1,
0x20, 0x20];
-
- /// returns a BinaryArray with invalid UTF8 data in the first character
- fn invalid_utf8_first_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
- let valid: &[u8] = b" ";
- let invalid = INVALID_UTF8_FIRST_CHAR;
- GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(invalid)])
- }
-
- /// Returns a BinaryArray with invalid UTF8 data in the first character of
a
- /// string larger than 12 bytes which is handled specially when reading
- /// `ByteViewArray`s
- fn invalid_utf8_first_char_long_strings<O: OffsetSizeTrait>() ->
GenericBinaryArray<O> {
- let valid: &[u8] = b" ";
- let mut invalid = vec![];
- invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
- invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR);
- GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(&invalid)])
- }
-
- /// returns a BinaryArray with invalid UTF8 data in a character other than
- /// the first (this is checked in a special codepath)
- fn invalid_utf8_later_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
- let valid: &[u8] = b" ";
- let invalid: &[u8] = INVALID_UTF8_LATER_CHAR;
- GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(invalid)])
- }
-
- /// returns a BinaryArray with invalid UTF8 data in a character other than
- /// the first in a string larger than 12 bytes which is handled specially
- /// when reading `ByteViewArray`s (this is checked in a special codepath)
- fn invalid_utf8_later_char_long_strings<O: OffsetSizeTrait>() ->
GenericBinaryArray<O> {
- let valid: &[u8] = b" ";
- let mut invalid = vec![];
- invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
- invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR);
- GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(&invalid)])
- }
-
- /// returns a BinaryArray with invalid UTF8 data in a character other than
- /// the first in a string larger than 128 bytes which is handled specially
- /// when reading `ByteViewArray`s (this is checked in a special codepath)
- fn invalid_utf8_later_char_really_long_strings<O: OffsetSizeTrait>() ->
GenericBinaryArray<O> {
- let valid: &[u8] = b" ";
- let mut invalid = vec![];
- for _ in 0..10 {
- // each instance is 38 bytes
-
invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
- }
- invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR);
- GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(&invalid)])
- }
-
- /// returns a BinaryArray with small invalid UTF8 data followed by a large
- /// invalid UTF8 data in a character other than the first in a string
larger
- fn invalid_utf8_later_char_really_long_strings2<O: OffsetSizeTrait>() ->
GenericBinaryArray<O> {
- let valid: &[u8] = b" ";
- let mut valid_long = vec![];
- for _ in 0..10 {
- // each instance is 38 bytes
-
valid_long.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
- }
- let invalid = INVALID_UTF8_LATER_CHAR;
- GenericBinaryArray::<O>::from_iter(vec![
- None,
- Some(valid),
- Some(invalid),
- None,
- Some(&valid_long),
- Some(valid),
- ])
- }
-
- /// writes the array into a single column parquet file with the specified
- /// encoding.
- ///
- /// If no encoding is specified, use default (dictionary) encoding
- fn write_to_parquet_with_encoding(array: ArrayRef, encoding:
Option<Encoding>) -> Vec<u8> {
- let batch = RecordBatch::try_from_iter(vec![("c", array)]).unwrap();
- let mut data = vec![];
- let schema = batch.schema();
- let props = encoding.map(|encoding| {
- WriterProperties::builder()
- // must disable dictionary encoding to actually use encoding
- .set_dictionary_enabled(false)
- .set_encoding(encoding)
- .build()
- });
-
- {
- let mut writer = ArrowWriter::try_new(&mut data, schema,
props).unwrap();
- writer.write(&batch).unwrap();
- writer.flush().unwrap();
- writer.close().unwrap();
- };
- data
- }
-
- /// read the parquet file into a record batch
- fn read_from_parquet(data: Vec<u8>) -> Result<Vec<RecordBatch>,
ArrowError> {
- let reader = ArrowReaderBuilder::try_new(bytes::Bytes::from(data))
- .unwrap()
- .build()
- .unwrap();
-
- reader.collect()
- }
-
#[test]
fn test_dictionary_preservation() {
let fields = vec![Arc::new(
diff --git a/parquet/tests/arrow_reader/invalid_utf8.rs
b/parquet/tests/arrow_reader/invalid_utf8.rs
new file mode 100644
index 0000000000..1124737a7f
--- /dev/null
+++ b/parquet/tests/arrow_reader/invalid_utf8.rs
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow::array::AsArray;
+use arrow_array::{
+ Array, ArrayRef, GenericBinaryArray, GenericStringArray, OffsetSizeTrait,
RecordBatch,
+ StringViewArray,
+};
+use arrow_schema::{ArrowError, DataType};
+use bytes::Bytes;
+use parquet::{
+ arrow::{
+ ArrowWriter,
+ arrow_reader::{ArrowReaderBuilder, ParquetRecordBatchReader},
+ },
+ basic::Encoding,
+ file::properties::WriterProperties,
+};
+
+#[test]
+fn test_invalid_utf8() {
+ // a parquet file with 1 column with invalid utf8
+ let data = vec![
+ 80, 65, 82, 49, 21, 6, 21, 22, 21, 22, 92, 21, 2, 21, 0, 21, 2, 21, 0,
21, 4, 21, 0, 18,
+ 28, 54, 0, 40, 5, 104, 101, 255, 108, 111, 24, 5, 104, 101, 255, 108,
111, 0, 0, 0, 3, 1,
+ 5, 0, 0, 0, 104, 101, 255, 108, 111, 38, 110, 28, 21, 12, 25, 37, 6,
0, 25, 24, 2, 99, 49,
+ 21, 0, 22, 2, 22, 102, 22, 102, 38, 8, 60, 54, 0, 40, 5, 104, 101,
255, 108, 111, 24, 5,
+ 104, 101, 255, 108, 111, 0, 0, 0, 21, 4, 25, 44, 72, 4, 114, 111, 111,
116, 21, 2, 0, 21,
+ 12, 37, 2, 24, 2, 99, 49, 37, 0, 76, 28, 0, 0, 0, 22, 2, 25, 28, 25,
28, 38, 110, 28, 21,
+ 12, 25, 37, 6, 0, 25, 24, 2, 99, 49, 21, 0, 22, 2, 22, 102, 22, 102,
38, 8, 60, 54, 0, 40,
+ 5, 104, 101, 255, 108, 111, 24, 5, 104, 101, 255, 108, 111, 0, 0, 0,
22, 102, 22, 2, 0, 40,
+ 44, 65, 114, 114, 111, 119, 50, 32, 45, 32, 78, 97, 116, 105, 118,
101, 32, 82, 117, 115,
+ 116, 32, 105, 109, 112, 108, 101, 109, 101, 110, 116, 97, 116, 105,
111, 110, 32, 111, 102,
+ 32, 65, 114, 114, 111, 119, 0, 130, 0, 0, 0, 80, 65, 82, 49,
+ ];
+
+ let file = Bytes::from(data);
+ let mut record_batch_reader = ParquetRecordBatchReader::try_new(file,
10).unwrap();
+
+ let error = record_batch_reader.next().unwrap().unwrap_err();
+
+ assert!(
+ error.to_string().contains("invalid utf-8 sequence"),
+ "{}",
+ error
+ );
+}
+
+#[test]
+fn test_invalid_utf8_string_array() {
+ test_invalid_utf8_string_array_inner::<i32>();
+}
+
+#[test]
+fn test_invalid_utf8_large_string_array() {
+ test_invalid_utf8_string_array_inner::<i64>();
+}
+
+fn test_invalid_utf8_string_array_inner<O: OffsetSizeTrait>() {
+ let cases = [
+ invalid_utf8_first_char::<O>(),
+ invalid_utf8_first_char_long_strings::<O>(),
+ invalid_utf8_later_char::<O>(),
+ invalid_utf8_later_char_long_strings::<O>(),
+ invalid_utf8_later_char_really_long_strings::<O>(),
+ invalid_utf8_later_char_really_long_strings2::<O>(),
+ ];
+ for array in &cases {
+ for encoding in STRING_ENCODINGS {
+ // data is not valid utf8 we can not construct a correct
StringArray
+ // safely, so purposely create an invalid StringArray
+ let array = unsafe {
+ GenericStringArray::<O>::new_unchecked(
+ array.offsets().clone(),
+ array.values().clone(),
+ array.nulls().cloned(),
+ )
+ };
+ let data_type = array.data_type().clone();
+ let data = write_to_parquet_with_encoding(Arc::new(array),
*encoding);
+ let err = read_from_parquet(data).unwrap_err();
+ let expected_err = "Parquet argument error: Parquet error:
encountered non UTF-8 data";
+ assert!(
+ err.to_string().contains(expected_err),
+ "data type: {data_type}, expected: {expected_err}, got: {err}"
+ );
+ }
+ }
+}
+
+#[test]
+fn test_invalid_utf8_string_view_array() {
+ let cases = [
+ invalid_utf8_first_char::<i32>(),
+ invalid_utf8_first_char_long_strings::<i32>(),
+ invalid_utf8_later_char::<i32>(),
+ invalid_utf8_later_char_long_strings::<i32>(),
+ invalid_utf8_later_char_really_long_strings::<i32>(),
+ invalid_utf8_later_char_really_long_strings2::<i32>(),
+ ];
+
+ for encoding in STRING_ENCODINGS {
+ for array in &cases {
+ let array = arrow_cast::cast(&array,
&DataType::BinaryView).unwrap();
+ let array = array.as_binary_view();
+
+ // data is not valid utf8 we can not construct a correct
StringArray
+ // safely, so purposely create an invalid StringViewArray
+ let array = unsafe {
+ StringViewArray::new_unchecked(
+ array.views().clone(),
+ array.data_buffers().to_vec(),
+ array.nulls().cloned(),
+ )
+ };
+
+ let data_type = array.data_type().clone();
+ let data = write_to_parquet_with_encoding(Arc::new(array),
*encoding);
+ let err = read_from_parquet(data).unwrap_err();
+ let expected_err = "Parquet argument error: Parquet error:
encountered non UTF-8 data";
+ assert!(
+ err.to_string().contains(expected_err),
+ "data type: {data_type}, expected: {expected_err}, got: {err}"
+ );
+ }
+ }
+}
+
+/// Encodings suitable for string data
+const STRING_ENCODINGS: &[Option<Encoding>] = &[
+ None,
+ Some(Encoding::PLAIN),
+ Some(Encoding::DELTA_LENGTH_BYTE_ARRAY),
+ Some(Encoding::DELTA_BYTE_ARRAY),
+];
+
+/// Invalid Utf-8 sequence in the first character
+/// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
+const INVALID_UTF8_FIRST_CHAR: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];
+
+/// Invalid Utf-8 sequence NOT in the first character
+/// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
+const INVALID_UTF8_LATER_CHAR: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20,
0x20];
+
+/// returns a BinaryArray with invalid UTF8 data in the first character
+fn invalid_utf8_first_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
+ let valid: &[u8] = b" ";
+ let invalid = INVALID_UTF8_FIRST_CHAR;
+ GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(invalid)])
+}
+
+/// Returns a BinaryArray with invalid UTF8 data in the first character of a
+/// string larger than 12 bytes which is handled specially when reading
+/// `ByteViewArray`s
+fn invalid_utf8_first_char_long_strings<O: OffsetSizeTrait>() ->
GenericBinaryArray<O> {
+ let valid: &[u8] = b" ";
+ let mut invalid = vec![];
+ invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
+ invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR);
+ GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(&invalid)])
+}
+
+/// returns a BinaryArray with invalid UTF8 data in a character other than
+/// the first (this is checked in a special codepath)
+fn invalid_utf8_later_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
+ let valid: &[u8] = b" ";
+ let invalid: &[u8] = INVALID_UTF8_LATER_CHAR;
+ GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(invalid)])
+}
+
+/// returns a BinaryArray with invalid UTF8 data in a character other than
+/// the first in a string larger than 12 bytes which is handled specially
+/// when reading `ByteViewArray`s (this is checked in a special codepath)
+fn invalid_utf8_later_char_long_strings<O: OffsetSizeTrait>() ->
GenericBinaryArray<O> {
+ let valid: &[u8] = b" ";
+ let mut invalid = vec![];
+ invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
+ invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR);
+ GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(&invalid)])
+}
+
+/// returns a BinaryArray with invalid UTF8 data in a character other than
+/// the first in a string larger than 128 bytes which is handled specially
+/// when reading `ByteViewArray`s (this is checked in a special codepath)
+fn invalid_utf8_later_char_really_long_strings<O: OffsetSizeTrait>() ->
GenericBinaryArray<O> {
+ let valid: &[u8] = b" ";
+ let mut invalid = vec![];
+ for _ in 0..10 {
+ // each instance is 38 bytes
+ invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
+ }
+ invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR);
+ GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None,
Some(&invalid)])
+}
+
+/// returns a BinaryArray with a small invalid UTF8 value (invalid in a
+/// character other than the first) mixed with valid strings larger than 128 bytes
+fn invalid_utf8_later_char_really_long_strings2<O: OffsetSizeTrait>() ->
GenericBinaryArray<O> {
+ let valid: &[u8] = b" ";
+ let mut valid_long = vec![];
+ for _ in 0..10 {
+ // each instance is 38 bytes
+
valid_long.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
+ }
+ let invalid = INVALID_UTF8_LATER_CHAR;
+ GenericBinaryArray::<O>::from_iter(vec![
+ None,
+ Some(valid),
+ Some(invalid),
+ None,
+ Some(&valid_long),
+ Some(valid),
+ ])
+}
+
+/// writes the array into a single column parquet file with the specified
+/// encoding.
+///
+/// If no encoding is specified, use default (dictionary) encoding
+fn write_to_parquet_with_encoding(array: ArrayRef, encoding: Option<Encoding>)
-> Vec<u8> {
+ let batch = RecordBatch::try_from_iter(vec![("c", array)]).unwrap();
+ let mut data = vec![];
+ let schema = batch.schema();
+ let props = encoding.map(|encoding| {
+ WriterProperties::builder()
+ // must disable dictionary encoding to actually use encoding
+ .set_dictionary_enabled(false)
+ .set_encoding(encoding)
+ .build()
+ });
+
+ {
+ let mut writer = ArrowWriter::try_new(&mut data, schema,
props).unwrap();
+ writer.write(&batch).unwrap();
+ writer.flush().unwrap();
+ writer.close().unwrap();
+ };
+ data
+}
+
+/// read the parquet file into a record batch
+fn read_from_parquet(data: Vec<u8>) -> Result<Vec<RecordBatch>, ArrowError> {
+ let reader = ArrowReaderBuilder::try_new(bytes::Bytes::from(data))
+ .unwrap()
+ .build()
+ .unwrap();
+
+ reader.collect()
+}
diff --git a/parquet/tests/arrow_reader/mod.rs
b/parquet/tests/arrow_reader/mod.rs
index ffc36655b3..3b8f71ccbe 100644
--- a/parquet/tests/arrow_reader/mod.rs
+++ b/parquet/tests/arrow_reader/mod.rs
@@ -43,6 +43,7 @@ mod bloom_filter;
#[cfg(feature = "crc")]
mod checksum;
mod int96_stats_roundtrip;
+mod invalid_utf8;
mod io;
#[cfg(feature = "async")]
mod predicate_cache;