This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 89075a75b9 Allow to read parquet binary column as UTF8 type (#6539)
89075a75b9 is described below
commit 89075a75b937eb8630b2e3f7d3b7fc4b6e6aece7
Author: Jax Liu <[email protected]>
AuthorDate: Fri Oct 11 00:44:13 2024 +0800
Allow to read parquet binary column as UTF8 type (#6539)
* allow to apply hint for binary as Utf8 type
* refactor tests
---
parquet/src/arrow/arrow_reader/mod.rs | 102 ++++++++++++++++++++++++++++++++++
parquet/src/arrow/schema/primitive.rs | 5 ++
2 files changed, 107 insertions(+)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a109851f72..d3709c03e9 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -3077,6 +3077,108 @@ mod tests {
);
}
+ #[test]
+ fn test_read_binary_as_utf8() {
+ let file = write_parquet_from_iter(vec![
+ (
+ "binary_to_utf8",
+ Arc::new(BinaryArray::from(vec![
+ b"one".as_ref(),
+ b"two".as_ref(),
+ b"three".as_ref(),
+ ])) as ArrayRef,
+ ),
+ (
+ "large_binary_to_large_utf8",
+ Arc::new(LargeBinaryArray::from(vec![
+ b"one".as_ref(),
+ b"two".as_ref(),
+ b"three".as_ref(),
+ ])) as ArrayRef,
+ ),
+ (
+ "binary_view_to_utf8_view",
+ Arc::new(BinaryViewArray::from(vec![
+ b"one".as_ref(),
+ b"two".as_ref(),
+ b"three".as_ref(),
+ ])) as ArrayRef,
+ ),
+ ]);
+ let supplied_fields = Fields::from(vec![
+ Field::new("binary_to_utf8", ArrowDataType::Utf8, false),
+ Field::new(
+ "large_binary_to_large_utf8",
+ ArrowDataType::LargeUtf8,
+ false,
+ ),
+ Field::new("binary_view_to_utf8_view", ArrowDataType::Utf8View, false),
+ ]);
+
+ let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields)));
+ let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
+ file.try_clone().unwrap(),
+ options,
+ )
+ .expect("reader builder with schema")
+ .build()
+ .expect("reader with schema");
+
+ let batch = arrow_reader.next().unwrap().unwrap();
+ assert_eq!(batch.num_columns(), 3);
+ assert_eq!(batch.num_rows(), 3);
+ assert_eq!(
+ batch
+ .column(0)
+ .as_string::<i32>()
+ .iter()
+ .collect::<Vec<_>>(),
+ vec![Some("one"), Some("two"), Some("three")]
+ );
+
+ assert_eq!(
+ batch
+ .column(1)
+ .as_string::<i64>()
+ .iter()
+ .collect::<Vec<_>>(),
+ vec![Some("one"), Some("two"), Some("three")]
+ );
+
+ assert_eq!(
+ batch.column(2).as_string_view().iter().collect::<Vec<_>>(),
+ vec![Some("one"), Some("two"), Some("three")]
+ );
+ }
+
+ #[test]
+ #[should_panic(expected = "Invalid UTF8 sequence at")]
+ fn test_read_non_utf8_binary_as_utf8() {
+ let file = write_parquet_from_iter(vec![(
+ "non_utf8_binary",
+ Arc::new(BinaryArray::from(vec![
+ b"\xDE\x00\xFF".as_ref(),
+ b"\xDE\x01\xAA".as_ref(),
+ b"\xDE\x02\xFF".as_ref(),
+ ])) as ArrayRef,
+ )]);
+ let supplied_fields = Fields::from(vec![Field::new(
+ "non_utf8_binary",
+ ArrowDataType::Utf8,
+ false,
+ )]);
+
+ let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields)));
+ let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
+ file.try_clone().unwrap(),
+ options,
+ )
+ .expect("reader builder with schema")
+ .build()
+ .expect("reader with schema");
+ arrow_reader.next().unwrap().unwrap_err();
+ }
+
#[test]
fn test_with_schema() {
let nested_fields = Fields::from(vec![
diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs
index 17dd7862f3..9f215b4dc0 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -57,6 +57,11 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType {
(DataType::Utf8, DataType::LargeUtf8) => hint,
(DataType::Binary, DataType::LargeBinary) => hint,
+ // Read as Utf8
+ (DataType::Binary, DataType::Utf8) => hint,
+ (DataType::Binary, DataType::LargeUtf8) => hint,
+ (DataType::Binary, DataType::Utf8View) => hint,
+
// Determine view type
(DataType::Utf8, DataType::Utf8View) => hint,
(DataType::Binary, DataType::BinaryView) => hint,