alamb commented on code in PR #6023:
URL: https://github.com/apache/arrow-rs/pull/6023#discussion_r1669326160
##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -2429,25 +2429,39 @@ mod tests {
invalid_utf8_first_char::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8
data",
),
+ (
+ invalid_utf8_first_char_long_strings::<O>(),
+ "Parquet argument error: Parquet error: encountered non UTF-8
data: invalid utf-8 sequence of 1 bytes from index 41",
+ ),
(
invalid_utf8_later_char::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8
data: invalid utf-8 sequence of 1 bytes from index 6",
),
+ (
+ invalid_utf8_later_char_long_strings::<O>(),
+ "Parquet argument error: Parquet error: encountered non UTF-8
data: invalid utf-8 sequence of 1 bytes from index 44",
+ ),
+ (
+ invalid_utf8_later_char_really_long_strings::<O>(),
+ "Parquet argument error: Parquet error: encountered non UTF-8
data: invalid utf-8 sequence of 1 bytes from index 386",
+ ),
];
- for (array, expected_error) in cases {
- // data is not valid utf8 we can not construct a correct
StringArray
- // safely, so purposely create an invalid StringArray
- let array = unsafe {
- GenericStringArray::<O>::new_unchecked(
- array.offsets().clone(),
- array.values().clone(),
- array.nulls().cloned(),
- )
- };
- let data_type = array.data_type().clone();
- let data = write_to_parquet(Arc::new(array));
- let err = read_from_parquet(data).unwrap_err();
- assert_eq!(err.to_string(), expected_error, "data type:
{data_type:?}")
+ for (array, expected_error) in &cases {
+ for encoding in STRING_ENCODINGS {
Review Comment:
Something isn't quite right here: I tried commenting out the UTF-8 checks
in the parquet readers, and these tests still pass.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]