This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 89075a75b9 Allow to read parquet binary column as UTF8 type (#6539)
89075a75b9 is described below

commit 89075a75b937eb8630b2e3f7d3b7fc4b6e6aece7
Author: Jax Liu <[email protected]>
AuthorDate: Fri Oct 11 00:44:13 2024 +0800

    Allow to read parquet binary column as UTF8 type (#6539)
    
    * allow to apply hint for binary as Utf8 type
    
    * refactor tests
---
 parquet/src/arrow/arrow_reader/mod.rs | 102 ++++++++++++++++++++++++++++++++++
 parquet/src/arrow/schema/primitive.rs |   5 ++
 2 files changed, 107 insertions(+)

diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a109851f72..d3709c03e9 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -3077,6 +3077,108 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_read_binary_as_utf8() {
+        let file = write_parquet_from_iter(vec![
+            (
+                "binary_to_utf8",
+                Arc::new(BinaryArray::from(vec![
+                    b"one".as_ref(),
+                    b"two".as_ref(),
+                    b"three".as_ref(),
+                ])) as ArrayRef,
+            ),
+            (
+                "large_binary_to_large_utf8",
+                Arc::new(LargeBinaryArray::from(vec![
+                    b"one".as_ref(),
+                    b"two".as_ref(),
+                    b"three".as_ref(),
+                ])) as ArrayRef,
+            ),
+            (
+                "binary_view_to_utf8_view",
+                Arc::new(BinaryViewArray::from(vec![
+                    b"one".as_ref(),
+                    b"two".as_ref(),
+                    b"three".as_ref(),
+                ])) as ArrayRef,
+            ),
+        ]);
+        let supplied_fields = Fields::from(vec![
+            Field::new("binary_to_utf8", ArrowDataType::Utf8, false),
+            Field::new(
+                "large_binary_to_large_utf8",
+                ArrowDataType::LargeUtf8,
+                false,
+            ),
+            Field::new("binary_view_to_utf8_view", ArrowDataType::Utf8View, false),
+        ]);
+
+        let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields)));
+        let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
+            file.try_clone().unwrap(),
+            options,
+        )
+        .expect("reader builder with schema")
+        .build()
+        .expect("reader with schema");
+
+        let batch = arrow_reader.next().unwrap().unwrap();
+        assert_eq!(batch.num_columns(), 3);
+        assert_eq!(batch.num_rows(), 3);
+        assert_eq!(
+            batch
+                .column(0)
+                .as_string::<i32>()
+                .iter()
+                .collect::<Vec<_>>(),
+            vec![Some("one"), Some("two"), Some("three")]
+        );
+
+        assert_eq!(
+            batch
+                .column(1)
+                .as_string::<i64>()
+                .iter()
+                .collect::<Vec<_>>(),
+            vec![Some("one"), Some("two"), Some("three")]
+        );
+
+        assert_eq!(
+            batch.column(2).as_string_view().iter().collect::<Vec<_>>(),
+            vec![Some("one"), Some("two"), Some("three")]
+        );
+    }
+
+    #[test]
+    #[should_panic(expected = "Invalid UTF8 sequence at")]
+    fn test_read_non_utf8_binary_as_utf8() {
+        let file = write_parquet_from_iter(vec![(
+            "non_utf8_binary",
+            Arc::new(BinaryArray::from(vec![
+                b"\xDE\x00\xFF".as_ref(),
+                b"\xDE\x01\xAA".as_ref(),
+                b"\xDE\x02\xFF".as_ref(),
+            ])) as ArrayRef,
+        )]);
+        let supplied_fields = Fields::from(vec![Field::new(
+            "non_utf8_binary",
+            ArrowDataType::Utf8,
+            false,
+        )]);
+
+        let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields)));
+        let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
+            file.try_clone().unwrap(),
+            options,
+        )
+        .expect("reader builder with schema")
+        .build()
+        .expect("reader with schema");
+        arrow_reader.next().unwrap().unwrap_err();
+    }
+
     #[test]
     fn test_with_schema() {
         let nested_fields = Fields::from(vec![
diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs
index 17dd7862f3..9f215b4dc0 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -57,6 +57,11 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType {
         (DataType::Utf8, DataType::LargeUtf8) => hint,
         (DataType::Binary, DataType::LargeBinary) => hint,
 
+        // Read as Utf8
+        (DataType::Binary, DataType::Utf8) => hint,
+        (DataType::Binary, DataType::LargeUtf8) => hint,
+        (DataType::Binary, DataType::Utf8View) => hint,
+
         // Determine view type
         (DataType::Utf8, DataType::Utf8View) => hint,
         (DataType::Binary, DataType::BinaryView) => hint,

Reply via email to