This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new d6a29ec103 Support reading/writing `VariantArray` to parquet with Variant LogicalType (#8408)
d6a29ec103 is described below

commit d6a29ec1033b133add2a94d5fb62bb450c9519ff
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Sep 26 09:20:54 2025 -0700

    Support reading/writing `VariantArray` to parquet with Variant LogicalType (#8408)
    
    # Which issue does this PR close?
    
    - Closes https://github.com/apache/arrow-rs/issues/8370
    - Closes https://github.com/apache/arrow-rs/pull/8365
    
    # Rationale for this change
    
    Parquet has logical types, which are how other writers signal which
    columns contain `Variant` values.
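
    For illustration, here is a minimal sketch (not part of this diff) of
    how that annotation can be checked from a file's footer, using the
    `parquet` APIs exercised by the tests below; the column name "var" is a
    hypothetical example:

    ```rust
    use bytes::Bytes;
    use parquet::basic::LogicalType;
    use parquet::errors::Result;
    use parquet::file::metadata::ParquetMetaDataReader;

    /// Returns true if the root field named "var" is annotated as Variant
    fn has_variant_column(data: Bytes) -> Result<bool> {
        // Parse the footer metadata and walk the root schema fields
        let mut reader = ParquetMetaDataReader::new();
        reader.try_parse(&data)?;
        let metadata = reader.finish()?;
        let schema = metadata.file_metadata().schema_descr();
        Ok(schema.root_schema().get_fields().iter().any(|f| {
            f.name() == "var"
                && f.get_basic_info().logical_type() == Some(LogicalType::Variant)
        }))
    }
    ```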
    
    # What changes are included in this PR?
    
    1. Add mapping from Parquet LogicalType to/from Arrow ExtensionType
    added in https://github.com/apache/arrow-rs/pull/8392 (see the sketch
    after this list)
    2. Documentation and tests showing reading/writing Parquet files with
    the Variant logical annotation
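
    As a sketch of the Arrow side of that mapping (assuming the
    `variant_experimental` feature is enabled; not part of this diff), the
    extension type is carried as `Field` metadata under the
    "ARROW:extension:name" key:

    ```rust
    use parquet::variant::{VariantArrayBuilder, VariantBuilderExt, VariantType};

    fn main() {
        // Build a one-row VariantArray, the Arrow representation of Variant
        let mut builder = VariantArrayBuilder::new(1);
        builder.append_value("such wow"); // a string Variant
        let array = builder.build();

        // `field()` returns an Arrow Field annotated with the extension type
        let field = array.field("data");
        assert_eq!(
            field.metadata().get("ARROW:extension:name").map(String::as_str),
            Some("arrow.parquet.variant")
        );
        assert!(field.try_extension_type::<VariantType>().is_ok());
    }
    ```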
    
    # Are these changes tested?
    
    Yes, new unit tests and doc examples
    
    # Are there any user-facing changes?
    You can now read and write Variant columns to/from `VariantArray`.
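
    For example, a condensed sketch of the round trip (adapted from the doc
    examples in this diff; `variant.parquet` is a placeholder path):

    ```rust
    use std::sync::Arc;
    use arrow_array::{ArrayRef, RecordBatch};
    use arrow_schema::Schema;
    use parquet::arrow::arrow_reader::ArrowReaderBuilder;
    use parquet::arrow::ArrowWriter;
    use parquet::variant::{Variant, VariantArray, VariantArrayBuilder, VariantBuilderExt};

    fn main() -> Result<(), parquet::errors::ParquetError> {
        // Write: convert the VariantArray to an ArrayRef plus a Field that
        // carries the extension metadata, then write it like any other column
        let mut builder = VariantArrayBuilder::new(1);
        builder.append_value("such wow"); // a string Variant
        let array = builder.build();
        let schema = Schema::new(vec![array.field("data")]);
        let batch = RecordBatch::try_new(Arc::new(schema), vec![ArrayRef::from(array)])?;
        let file = std::fs::File::create("variant.parquet")?;
        let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
        writer.write(&batch)?;
        writer.close()?;

        // Read: the column comes back as a StructArray; re-wrap it in a
        // VariantArray to access the values
        let file = std::fs::File::open("variant.parquet")?;
        let mut reader = ArrowReaderBuilder::try_new(file)?.build()?;
        let batch = reader.next().unwrap()?;
        let variants = VariantArray::try_new(batch.column_by_name("data").unwrap())?;
        assert_eq!(variants.value(0), Variant::from("such wow"));
        Ok(())
    }
    ```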
    
    ---------
    
    Co-authored-by: Matthijs Brobbel <[email protected]>
---
 parquet-variant-compute/Cargo.toml    |   2 +-
 parquet/Cargo.toml                    |   2 +-
 parquet/src/arrow/schema/complex.rs   |  29 ++--
 parquet/src/arrow/schema/extension.rs |  72 +++++++++
 parquet/src/arrow/schema/mod.rs       |  12 +-
 parquet/src/variant.rs                | 269 +++++++++++++++++++++++++++++-----
 6 files changed, 330 insertions(+), 56 deletions(-)

diff --git a/parquet-variant-compute/Cargo.toml b/parquet-variant-compute/Cargo.toml
index 64ab195a52..828ad77bd6 100644
--- a/parquet-variant-compute/Cargo.toml
+++ b/parquet-variant-compute/Cargo.toml
@@ -31,7 +31,7 @@ rust-version = { workspace = true }
 
 
 [dependencies]
-arrow = { workspace = true }
+arrow = { workspace = true , features = ["canonical_extension_types"]}
 arrow-schema = { workspace = true }
 half = { version = "2.1", default-features = false }
 indexmap = "2.10.0"
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index f57a7627a5..06e6aac2e3 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -130,7 +130,7 @@ encryption = ["dep:ring"]
 flate2-rust_backened = ["flate2/rust_backend"]
 flate2-zlib-rs = ["flate2/zlib-rs"]
 # Enable parquet variant support
-variant_experimental = ["parquet-variant", "parquet-variant-json", "parquet-variant-compute"]
+variant_experimental = ["arrow", "parquet-variant", "parquet-variant-json", "parquet-variant-compute"]
 
 
 [[example]]
diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs
index 16d46bd852..ecc80a6590 100644
--- a/parquet/src/arrow/schema/complex.rs
+++ b/parquet/src/arrow/schema/complex.rs
@@ -18,6 +18,7 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 
+use crate::arrow::schema::extension::add_extension_type;
 use crate::arrow::schema::primitive::convert_primitive;
 use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY};
 use crate::basic::{ConvertedType, Repetition};
@@ -172,7 +173,7 @@ impl Visitor {
 
         let parquet_fields = struct_type.get_fields();
 
-        // Extract the arrow fields
+        // Extract any arrow fields from the hints
         let arrow_fields = match &context.data_type {
             Some(DataType::Struct(fields)) => {
                 if fields.len() != parquet_fields.len() {
@@ -220,10 +221,10 @@ impl Visitor {
                 data_type,
             };
 
-            if let Some(child) = self.dispatch(parquet_field, child_ctx)? {
+            if let Some(mut child) = self.dispatch(parquet_field, child_ctx)? {
                 // The child type returned may be different from what is encoded in the arrow
                 // schema in the event of a mismatch or a projection
-                child_fields.push(convert_field(parquet_field, &child, arrow_field));
+                child_fields.push(convert_field(parquet_field, &mut child, arrow_field));
                 children.push(child);
             }
         }
@@ -352,13 +353,13 @@ impl Visitor {
 
         // Need both columns to be projected
         match (maybe_key, maybe_value) {
-            (Some(key), Some(value)) => {
+            (Some(mut key), Some(mut value)) => {
                 let key_field = Arc::new(
-                    convert_field(map_key, &key, arrow_key)
+                    convert_field(map_key, &mut key, arrow_key)
                         // The key is always non-nullable (#5630)
                         .with_nullable(false),
                 );
-                let value_field = Arc::new(convert_field(map_value, &value, arrow_value));
+                let value_field = Arc::new(convert_field(map_value, &mut value, arrow_value));
                 let field_metadata = match arrow_map {
                     Some(field) => field.metadata().clone(),
                     _ => HashMap::default(),
@@ -495,8 +496,8 @@ impl Visitor {
         };
 
         match self.dispatch(item_type, new_context) {
-            Ok(Some(item)) => {
-                let item_field = Arc::new(convert_field(item_type, &item, arrow_field));
+            Ok(Some(mut item)) => {
+                let item_field = Arc::new(convert_field(item_type, &mut item, arrow_field));
 
                 // Use arrow type as hint for index size
                 let arrow_type = match context.data_type {
@@ -540,11 +541,15 @@ impl Visitor {
     }
 }
 
-/// Computes the [`Field`] for a child column
+/// Computes the Arrow [`Field`] for a child column
 ///
-/// The resulting [`Field`] will have the type dictated by `field`, a name
+/// The resulting Arrow [`Field`] will have the type dictated by the Parquet `field`, a name
 /// dictated by the `parquet_type`, and any metadata from `arrow_hint`
-fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<&Field>) -> Field {
+fn convert_field(
+    parquet_type: &Type,
+    field: &mut ParquetField,
+    arrow_hint: Option<&Field>,
+) -> Field {
     let name = parquet_type.name();
     let data_type = field.arrow_type.clone();
     let nullable = field.nullable;
@@ -575,7 +580,7 @@ fn convert_field(parquet_type: &Type, field: &ParquetField, arrow_hint: Option<&
                 );
                 ret.set_metadata(meta);
             }
-            ret
+            add_extension_type(ret, parquet_type)
         }
     }
 }
diff --git a/parquet/src/arrow/schema/extension.rs b/parquet/src/arrow/schema/extension.rs
new file mode 100644
index 0000000000..752b9a5ced
--- /dev/null
+++ b/parquet/src/arrow/schema/extension.rs
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Arrow Extension Type Support for Parquet
+//!
+//! This module contains mapping code to map Parquet [`LogicalType`]s to/from
+//! Arrow [`ExtensionType`]s.
+//!
+//! Extension types are represented using the metadata from Arrow [`Field`]s
+//! with the key "ARROW:extension:name".
+
+use crate::basic::LogicalType;
+use crate::schema::types::Type;
+use arrow_schema::extension::ExtensionType;
+use arrow_schema::Field;
+
+/// Adds extension type metadata, if necessary, based on the Parquet field's
+/// [`LogicalType`]
+///
+/// Some Parquet logical types, such as Variant, do not map directly to an
+/// Arrow DataType, and instead are represented by an Arrow ExtensionType.
+/// Extension types are attached to Arrow Fields via metadata.
+pub(crate) fn add_extension_type(mut arrow_field: Field, parquet_type: &Type) -> Field {
+    match parquet_type.get_basic_info().logical_type() {
+        #[cfg(feature = "variant_experimental")]
+        Some(LogicalType::Variant) => {
+            // try to add the Variant extension type, but if that fails (e.g. because the
+            // storage type is not supported), just return the field as is
+            arrow_field
+                .try_with_extension_type(parquet_variant_compute::VariantType)
+                .ok();
+            arrow_field
+        }
+        // TODO add other LogicalTypes here
+        _ => arrow_field,
+    }
+}
+
+/// Return the Parquet logical type to use for the specified Arrow field, if any.
+#[cfg(feature = "variant_experimental")]
+pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> {
+    use parquet_variant_compute::VariantType;
+    // Check the name (= quick and cheap) and only try_extension_type if the name matches
+    // to avoid unnecessary String allocations in ArrowError
+    if field.extension_type_name()? != VariantType::NAME {
+        return None;
+    }
+    match field.try_extension_type::<VariantType>() {
+        Ok(VariantType) => Some(LogicalType::Variant),
+        // Given check above, this should not error, but if it does ignore
+        Err(_e) => None,
+    }
+}
+
+#[cfg(not(feature = "variant_experimental"))]
+pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> {
+    None
+}
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 5b079b6627..9d1098d86c 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -35,13 +35,14 @@ use crate::file::{metadata::KeyValue, properties::WriterProperties};
 use crate::schema::types::{ColumnDescriptor, SchemaDescriptor, Type};
 
 mod complex;
+mod extension;
 mod primitive;
 
+use super::PARQUET_FIELD_ID_META_KEY;
+use crate::arrow::schema::extension::logical_type_for_struct;
 use crate::arrow::ProjectionMask;
 pub(crate) use complex::{ParquetField, ParquetFieldType};
 
-use super::PARQUET_FIELD_ID_META_KEY;
-
 /// Convert Parquet schema to Arrow schema including optional metadata
 ///
 /// Attempts to decode any existing Arrow schema metadata, falling back
@@ -63,7 +64,11 @@ pub fn parquet_to_arrow_schema_by_columns(
     Ok(parquet_to_arrow_schema_and_fields(parquet_schema, mask, key_value_metadata)?.0)
 }
 
-/// Extracts the arrow metadata
+/// Determines the Arrow Schema from a Parquet schema
+///
+/// Looks for an Arrow schema metadata "hint" (see
+/// [`parquet_to_arrow_field_levels`]), and uses it if present to ensure
+/// lossless round trips.
 pub(crate) fn parquet_to_arrow_schema_and_fields(
     parquet_schema: &SchemaDescriptor,
     mask: ProjectionMask,
@@ -728,6 +733,7 @@ fn arrow_to_parquet_type(field: &Field, coerce_types: bool) -> Result<Type> {
                 .with_fields(fields)
                 .with_repetition(repetition)
                 .with_id(id)
+                .with_logical_type(logical_type_for_struct(field))
                 .build()
         }
         DataType::Map(field, _) => {
diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs
index b5902c02ed..497d1dc6c4 100644
--- a/parquet/src/variant.rs
+++ b/parquet/src/variant.rs
@@ -25,38 +25,36 @@
 //! * [`Variant`] represents variant value, which can be an object, list, or primitive.
 //! * [`VariantBuilder`] for building `Variant` values.
 //! * [`VariantArray`] for representing a column of Variant values.
-//! * [`compute`] module with functions for manipulating Variants, such as
+//! * [`json_to_variant`] and [`variant_to_json`] for converting to/from JSON.
+//! * [`cast_to_variant()`] for casting other Arrow arrays to `VariantArray`.
+//! * [`VariantType`] Arrow ExtensionType for Parquet Variant logical type.
 //!   [`variant_get`] to extracting a value by path and functions to convert
 //!   between `Variant` and JSON.
 //!
-//! [Variant Logical Type]: Variant
-//! [`VariantArray`]: compute::VariantArray
-//! [`variant_get`]: compute::variant_get
-//!
 //! # Example: Writing a Parquet file with Variant column
 //! ```rust
-//! # use parquet::variant::compute::{VariantArray, VariantArrayBuilder};
-//! # use parquet::variant::VariantBuilderExt;
+//! # use parquet::variant::{VariantArray, VariantType, VariantArrayBuilder, VariantBuilderExt};
 //! # use std::sync::Arc;
-//! # use arrow_array::{ArrayRef, RecordBatch};
+//! # use arrow_array::{Array, ArrayRef, RecordBatch};
+//! # use arrow_schema::{DataType, Field, Schema};
 //! # use parquet::arrow::ArrowWriter;
 //! # fn main() -> Result<(), parquet::errors::ParquetError> {
 //!  // Use the VariantArrayBuilder to build a VariantArray
 //!  let mut builder = VariantArrayBuilder::new(3);
-//!  // row 1: {"name": "Alice"}
-//!  builder.new_object().with_field("name", "Alice").finish();
+//!  builder.new_object().with_field("name", "Alice").finish(); // row 1: {"name": "Alice"}
+//!  builder.append_value("such wow"); // row 2: "such wow" (a string)
 //!  let array = builder.build();
 //!
-//! // TODO support writing VariantArray directly
-//! // at the moment it panics when trying to downcast to a struct array
-//! // https://github.com/apache/arrow-rs/issues/8296
-//! //  let array: ArrayRef = Arc::new(array);
-//! let array: ArrayRef = Arc::new(array.into_inner());
-//!
+//!  // Since VariantArray is an ExtensionType, it needs to be converted
+//!  // to an ArrayRef and Field with the appropriate metadata
+//!  // before it can be written to a Parquet file
+//!  let field = array.field("data");
+//!  let array = ArrayRef::from(array);
 //!  // create a RecordBatch with the VariantArray
-//!  let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
+//!  let schema = Schema::new(vec![field]);
+//!  let batch = RecordBatch::try_new(Arc::new(schema), vec![array])?;
 //!
-//!  // write the RecordBatch to a Parquet file
+//!  // Now you can write the RecordBatch to the Parquet file, as normal
 //!  let file = std::fs::File::create("variant.parquet")?;
 //!  let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
 //!  writer.write(&batch)?;
@@ -67,37 +65,29 @@
 //! # }
 //! ```
 //!
-//! # Example: Writing JSON with a Parquet file with Variant column
+//! # Example: Writing JSON into a Parquet file with Variant column
 //! ```rust
 //! # use std::sync::Arc;
 //! # use arrow_array::{ArrayRef, RecordBatch, StringArray};
-//! # use parquet::variant::compute::json_to_variant;
-//! # use parquet::variant::compute::VariantArray;
+//! # use arrow_schema::Schema;
+//! # use parquet::variant::{json_to_variant, VariantArray};
 //! # use parquet::arrow::ArrowWriter;
 //! # fn main() -> Result<(), parquet::errors::ParquetError> {
-//! // Create an array of JSON strings, simulating a column of JSON data
-//! // TODO use StringViewArray when available
-//! let input_array = StringArray::from(vec![
+//!  // Create an array of JSON strings, simulating a column of JSON data
+//!  let input_array: ArrayRef = Arc::new(StringArray::from(vec![
 //!   Some(r#"{"name": "Alice", "age": 30}"#),
 //!   Some(r#"{"name": "Bob", "age": 25, "address": {"city": "New York"}}"#),
 //!   None,
 //!   Some("{}"),
-//! ]);
-//! let input_array: ArrayRef = Arc::new(input_array);
-//!
-//! // Convert the JSON strings to a VariantArray
-//! let array: VariantArray = json_to_variant(&input_array)?;
-//!
-//! // TODO support writing VariantArray directly
-//! // at the moment it panics when trying to downcast to a struct array
-//! // https://github.com/apache/arrow-rs/issues/8296
-//! //  let array: ArrayRef = Arc::new(array);
-//! let array: ArrayRef = Arc::new(array.into_inner());
+//!  ]));
 //!
+//!  // Convert the JSON strings to a VariantArray
+//!  let array: VariantArray = json_to_variant(&input_array)?;
 //!  // create a RecordBatch with the VariantArray
-//!  let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
+//!  let schema = Schema::new(vec![array.field("data")]);
+//!  let batch = RecordBatch::try_new(Arc::new(schema), vec![ArrayRef::from(array)])?;
 //!
-//!  // write the RecordBatch to a Parquet file
+//!  // write the RecordBatch to a Parquet file as normal
 //!  let file = std::fs::File::create("variant-json.parquet")?;
 //!  let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
 //!  writer.write(&batch)?;
@@ -108,6 +98,207 @@
 //! ```
 //!
 //! # Example: Reading a Parquet file with Variant column
-//! (TODO: add example)
+//!
+//! Use the [`VariantType`] extension type to find the Variant column:
+//!
+//! ```
+//! # use std::sync::Arc;
+//! # use std::path::PathBuf;
+//! # use arrow_array::{ArrayRef, RecordBatch, RecordBatchReader};
+//! # use parquet::variant::{Variant, VariantArray, VariantType};
+//! # use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//! # use arrow_array::StructArray;
+//! # fn file_path() -> PathBuf { // return a testing file path
+//! #    PathBuf::from(arrow::util::test_util::parquet_test_data())
+//! #   .join("..")
+//! #   .join("shredded_variant")
+//! #   .join("case-075.parquet")
+//! # }
+//! // Read the Parquet file using standard Arrow Parquet reader.
+//! // Note this file has 2 columns: "id", "var", and the "var" column
+//! // contains a variant that looks like this:
+//! // "Variant(metadata=VariantMetadata(dict={}), value=Variant(type=STRING, value=iceberg))"
+//! let file = std::fs::File::open(file_path())?;
+//! let mut reader = ArrowReaderBuilder::try_new(file)?.build()?;
+//!
+//! // You can check if a column contains a Variant using
+//! // the VariantType extension type
+//! let schema = reader.schema();
+//! let field = schema.field_with_name("var")?;
+//! assert!(field.try_extension_type::<VariantType>().is_ok());
+//!
+//! // The reader will yield RecordBatches with a StructArray
+//! // to convert them to VariantArray, use VariantArray::try_new
+//! let batch = reader.next().unwrap().unwrap();
+//!
+//! let col = batch.column_by_name("var").unwrap();
+//! let var_array = VariantArray::try_new(col)?;
+//! assert_eq!(var_array.len(), 1);
+//! let var_value: Variant = var_array.value(0);
+//! assert_eq!(var_value, Variant::from("iceberg")); // the value in case-075.parquet
+//! # Ok(())
+//! # }
+//! ```
 pub use parquet_variant::*;
-pub use parquet_variant_compute as compute;
+pub use parquet_variant_compute::*;
+
+#[cfg(test)]
+mod tests {
+    use crate::arrow::arrow_reader::ArrowReaderBuilder;
+    use crate::arrow::ArrowWriter;
+    use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
+    use crate::file::reader::ChunkReader;
+    use arrow::util::test_util::parquet_test_data;
+    use arrow_array::{ArrayRef, RecordBatch};
+    use arrow_schema::Schema;
+    use bytes::Bytes;
+    use parquet_variant::{Variant, VariantBuilderExt};
+    use parquet_variant_compute::{VariantArray, VariantArrayBuilder, VariantType};
+    use std::path::PathBuf;
+    use std::sync::Arc;
+
+    #[test]
+    fn roundtrip_basic() {
+        roundtrip(variant_array());
+    }
+
+    /// Ensure a file with Variant LogicalType, written by another writer in
+    /// parquet-testing, can be read as a VariantArray
+    #[test]
+    fn read_logical_type() {
+        // Note: case-075 has 2 columns ("id", "var")
+        // The variant looks like this:
+        // "Variant(metadata=VariantMetadata(dict={}), value=Variant(type=STRING, value=iceberg))"
+        let batch = read_shredded_variant_test_case("case-075.parquet");
+
+        assert_variant_metadata(&batch, "var");
+        let var_column = batch.column_by_name("var").expect("expected var column");
+        let var_array =
+            VariantArray::try_new(&var_column).expect("expected var column to be a VariantArray");
+
+        // verify the value
+        assert_eq!(var_array.len(), 1);
+        assert!(var_array.is_valid(0));
+        let var_value = var_array.value(0);
+        assert_eq!(var_value, Variant::from("iceberg"));
+    }
+
+    /// Writes a variant to a parquet file and ensures the parquet logical type
+    /// annotation is correct
+    #[test]
+    fn write_logical_type() {
+        let array = variant_array();
+        let batch = variant_array_to_batch(array);
+        let buffer = write_to_buffer(&batch);
+
+        // read the parquet file's metadata and verify the logical type
+        let metadata = read_metadata(&Bytes::from(buffer));
+        let schema = metadata.file_metadata().schema_descr();
+        let fields = schema.root_schema().get_fields();
+        assert_eq!(fields.len(), 1);
+        let field = &fields[0];
+        assert_eq!(field.name(), "data");
+        // data should have been written with the Variant logical type
+        assert_eq!(
+            field.get_basic_info().logical_type(),
+            Some(crate::basic::LogicalType::Variant)
+        );
+    }
+
+    /// Return a VariantArray with 3 rows:
+    ///
+    /// 1. `{"name": "Alice"}`
+    /// 2. `"such wow"` (a string)
+    /// 3. `null`
+    fn variant_array() -> VariantArray {
+        let mut builder = VariantArrayBuilder::new(3);
+        // row 1: {"name": "Alice"}
+        builder.new_object().with_field("name", "Alice").finish();
+        // row 2: "such wow" (a string)
+        builder.append_value("such wow");
+        // row 3: null
+        builder.append_null();
+        builder.build()
+    }
+
+    /// Writes a VariantArray to a parquet file and reads it back, verifying that
+    /// the data is the same
+    fn roundtrip(array: VariantArray) {
+        let source_batch = variant_array_to_batch(array);
+        assert_variant_metadata(&source_batch, "data");
+
+        let buffer = write_to_buffer(&source_batch);
+        let result_batch = read_to_batch(Bytes::from(buffer));
+        assert_variant_metadata(&result_batch, "data");
+        assert_eq!(result_batch, source_batch); // NB this also checks the schemas
+    }
+
+    /// creates a RecordBatch with a single column "data" from a VariantArray
+    fn variant_array_to_batch(array: VariantArray) -> RecordBatch {
+        let field = array.field("data");
+        let schema = Schema::new(vec![field]);
+        RecordBatch::try_new(Arc::new(schema), vec![ArrayRef::from(array)]).unwrap()
+    }
+
+    /// writes a RecordBatch to memory buffer and returns the buffer
+    fn write_to_buffer(batch: &RecordBatch) -> Vec<u8> {
+        let mut buffer = vec![];
+        let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
+        writer.write(batch).unwrap();
+        writer.close().unwrap();
+        buffer
+    }
+
+    /// Reads the Parquet metadata
+    fn read_metadata<T: ChunkReader + 'static>(input: &T) -> ParquetMetaData {
+        let mut reader = ParquetMetaDataReader::new();
+        reader.try_parse(input).unwrap();
+        reader.finish().unwrap()
+    }
+
+    /// Reads a RecordBatch from a reader (e.g. Vec or File)
+    fn read_to_batch<T: ChunkReader + 'static>(reader: T) -> RecordBatch {
+        let reader = ArrowReaderBuilder::try_new(reader)
+            .unwrap()
+            .build()
+            .unwrap();
+        let mut batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>().unwrap();
+        assert_eq!(batches.len(), 1);
+        batches.swap_remove(0)
+    }
+
+    /// Verifies the variant metadata is present in the schema for the specified
+    /// field name.
+    fn assert_variant_metadata(batch: &RecordBatch, field_name: &str) {
+        let schema = batch.schema();
+        let field = schema
+            .field_with_name(field_name)
+            .expect("could not find expected field");
+
+        // explicitly check the metadata so it is clear in the tests what the
+        // names are
+        let metadata_value = field
+            .metadata()
+            .get("ARROW:extension:name")
+            .expect("metadata does not exist");
+
+        assert_eq!(metadata_value, "arrow.parquet.variant");
+
+        // verify that `VariantType` also correctly finds the metadata
+        field
+            .try_extension_type::<VariantType>()
+            .expect("VariantExtensionType should be readable");
+    }
+
+    /// Read the specified test case filename from parquet-testing
+    /// See parquet-testing/shredded_variant/cases.json for more details
+    fn read_shredded_variant_test_case(name: &str) -> RecordBatch {
+        let case_file = PathBuf::from(parquet_test_data())
+            .join("..") // go up from data/ to parquet-testing/
+            .join("shredded_variant")
+            .join(name);
+        let case_file = std::fs::File::open(case_file).unwrap();
+        read_to_batch(case_file)
+    }
+}
