alamb commented on code in PR #6023:
URL: https://github.com/apache/arrow-rs/pull/6023#discussion_r1669361838
##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -2429,25 +2429,39 @@ mod tests {
invalid_utf8_first_char::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8
data",
),
+ (
+ invalid_utf8_first_char_long_strings::<O>(),
+ "Parquet argument error: Parquet error: encountered non UTF-8
data: invalid utf-8 sequence of 1 bytes from index 41",
+ ),
(
invalid_utf8_later_char::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8
data: invalid utf-8 sequence of 1 bytes from index 6",
),
+ (
+ invalid_utf8_later_char_long_strings::<O>(),
+ "Parquet argument error: Parquet error: encountered non UTF-8
data: invalid utf-8 sequence of 1 bytes from index 44",
+ ),
+ (
+ invalid_utf8_later_char_really_long_strings::<O>(),
+ "Parquet argument error: Parquet error: encountered non UTF-8
data: invalid utf-8 sequence of 1 bytes from index 386",
+ ),
];
- for (array, expected_error) in cases {
- // data is not valid utf8 we can not construct a correct
StringArray
- // safely, so purposely create an invalid StringArray
- let array = unsafe {
- GenericStringArray::<O>::new_unchecked(
- array.offsets().clone(),
- array.values().clone(),
- array.nulls().cloned(),
- )
- };
- let data_type = array.data_type().clone();
- let data = write_to_parquet(Arc::new(array));
- let err = read_from_parquet(data).unwrap_err();
- assert_eq!(err.to_string(), expected_error, "data type:
{data_type:?}")
+ for (array, expected_error) in &cases {
+ for encoding in STRING_ENCODINGS {
Review Comment:
Needed the invalid_utf8_later_char_really_long_strings2 test and to set up
the encoder correctly
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]