etseidl commented on code in PR #6840:
URL: https://github.com/apache/arrow-rs/pull/6840#discussion_r1872335773


##########
parquet/src/arrow/schema/mod.rs:
##########
@@ -1487,7 +1590,10 @@ mod tests {
         ";
         let parquet_group_type = parse_message_type(message_type).unwrap();
         let parquet_schema = 
SchemaDescriptor::new(Arc::new(parquet_group_type));
-        let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, 
true).unwrap();
+        let converted_arrow_schema = 
ArrowToParquetSchemaConverter::new(&arrow_schema)
+            .with_coerce_types(true)

Review Comment:
   I love how this makes what's going on more explicit.



##########
parquet/src/arrow/schema/mod.rs:
##########
@@ -225,27 +225,130 @@ pub(crate) fn 
add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
     }
 }
 
-/// Convert arrow schema to parquet schema
+/// Converter for arrow schema to parquet schema
 ///
-/// The name of the root schema element defaults to `"arrow_schema"`, this can 
be
-/// overridden with [`arrow_to_parquet_schema_with_root`]
-pub fn arrow_to_parquet_schema(schema: &Schema, coerce_types: bool) -> 
Result<SchemaDescriptor> {
-    arrow_to_parquet_schema_with_root(schema, "arrow_schema", coerce_types)
+/// Example:
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow_schema::{Field, Schema, DataType};
+/// # use parquet::arrow::ArrowToParquetSchemaConverter;
+/// use parquet::schema::types::{SchemaDescriptor, Type};
+/// use parquet::basic; // note there are two `Type`s in the following example
+/// let arrow_schema = Schema::new(vec![
+///   Field::new("a", DataType::Int64, true),
+///   Field::new("b", DataType::Date32, true),
+/// ]);
+///
+/// let parquet_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+///   .build()
+///   .unwrap();
+/// //
+/// let expected_parquet_schema = SchemaDescriptor::new(
+///   Arc::new(
+///     Type::group_type_builder("arrow_schema")
+///       .with_fields(vec![
+///         Arc::new(
+///          Type::primitive_type_builder("a", basic::Type::INT64)
+///           .build().unwrap()
+///         ),
+///         Arc::new(
+///          Type::primitive_type_builder("b", basic::Type::INT32)
+///           .with_converted_type(basic::ConvertedType::DATE)
+///           .with_logical_type(Some(basic::LogicalType::Date))
+///           .build().unwrap()
+///         ),
+///      ])
+///      .build().unwrap()
+///   )
+/// );
+///
+/// assert_eq!(parquet_schema, expected_parquet_schema);
+/// ```
+#[derive(Debug)]
+pub struct ArrowToParquetSchemaConverter<'a> {
+    /// The schema to convert
+    schema: &'a Schema,
+    /// Name of the root schema in Parquet
+    schema_root: &'a str,
+    /// Should we coerce arrow types to compatible Parquet types?
+    ///
+    /// See docs on [Self::with_coerce_types]
+    coerce_types: bool,
 }
 
-/// Convert arrow schema to parquet schema specifying the name of the root 
schema element
-pub fn arrow_to_parquet_schema_with_root(
-    schema: &Schema,
-    root: &str,
-    coerce_types: bool,
-) -> Result<SchemaDescriptor> {
-    let fields = schema
-        .fields()
-        .iter()
-        .map(|field| arrow_to_parquet_type(field, coerce_types).map(Arc::new))
-        .collect::<Result<_>>()?;
-    let group = Type::group_type_builder(root).with_fields(fields).build()?;
-    Ok(SchemaDescriptor::new(Arc::new(group)))
+impl<'a> ArrowToParquetSchemaConverter<'a> {
+    /// Create a new converter
+    pub fn new(schema: &'a Schema) -> Self {
+        Self {
+            schema,
+            schema_root: "arrow_schema",
+            coerce_types: false,
+        }
+    }
+
+    /// Should arrow types be coerced into parquet native types (default 
false).
+    ///
+    /// Setting this option to `true` will result in parquet files that can be
+    /// read by more readers, but may lose precision for arrow types such as
+    /// [`DataType::Date64`] which have no direct corresponding Parquet type.
+    ///
+    /// By default, does not coerce to native parquet types. Enabling type
+    /// coercion allows for meaningful representations that do not require
+    /// downstream readers to consider the embedded Arrow schema, and can allow
+    /// for greater compatibility with other Parquet implementations. However,
+    /// type coercion also prevents data from being losslessly round-tripped.
+    ///
+    /// # Discussion
+    ///
+    /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
+    /// corresponding Parquet logical type. Thus, they can not be losslessly
+    /// round-tripped when stored using the appropriate Parquet logical type.
+    ///
+    /// For example, some Date64 values may be truncated when stored with
+    /// parquet's native 32 bit date type. For [`List`] and [`Map`] types, some
+    /// Parquet readers expect certain schema elements to have specific names
+    /// (earlier versions of the spec were somewhat ambiguous on this point).

Review Comment:
   ```suggestion
       /// round-tripped when stored using the appropriate Parquet logical type.
       /// For example, some Date64 values may be truncated when stored with
       /// parquet's native 32 bit date type.
       ///
       /// For [`List`] and [`Map`] types, some
       /// Parquet readers expect certain schema elements to have specific names
       /// (earlier versions of the spec were somewhat ambiguous on this point).
       /// Type coercion will use the names prescribed by the Parquet 
specification,
       /// potentially losing naming metadata from the Arrow schema.
   ```
   
   Just trying to make clear what the naming impacts are...not sure if this is 
an improvement.



##########
parquet/src/arrow/schema/mod.rs:
##########
@@ -225,27 +225,130 @@ pub(crate) fn 
add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
     }
 }
 
-/// Convert arrow schema to parquet schema
+/// Converter for arrow schema to parquet schema

Review Comment:
   The docs are pretty inconsistent on whether "arrow" and "parquet" are 
capitalized or not. Do you have a preference?



##########
parquet/src/file/properties.rs:
##########
@@ -780,22 +779,16 @@ impl WriterPropertiesBuilder {
         self
     }
 
-    /// Sets flag to control if type coercion is enabled (defaults to `false`).
+    /// Should the writer coerce types to parquet native types (defaults to 
`false`).
     ///
-    /// # Notes
-    /// Some Arrow types do not have a corresponding Parquet logical type.
-    /// Affected Arrow data types include `Date64`, `Timestamp` and `Interval`.
-    /// Also, for [`List`] and [`Map`] types, Parquet expects certain schema 
elements
-    /// to have specific names to be considered fully compliant.
-    /// Writers have the option to coerce these types and names to match those 
required
-    /// by the Parquet specification.
-    /// This type coercion allows for meaningful representations that do not 
require
-    /// downstream readers to consider the embedded Arrow schema, and can 
allow for greater
-    /// compatibility with other Parquet implementations. However, type
-    /// coercion also prevents the data from being losslessly round-tripped.
-    ///
-    /// [`List`]: 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
-    /// [`Map`]: 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps
+    /// Setting this option to `true` will result in parquet files that can be
+    /// read by more readers, but may lose precision for arrow types such as
+    /// [`DataType::Date64`] which have no direct corresponding Parquet type.

Review Comment:
   I think we should still mention the naming impacts here. There seem to be a 
number of users who want Parquet naming, and I think this is the first place 
they'd look to figure out how to do so. I'm not sure I'd think to follow the 
link to the converter when I want my lists named properly.



##########
parquet/src/arrow/schema/mod.rs:
##########
@@ -225,27 +225,130 @@ pub(crate) fn 
add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
     }
 }
 
-/// Convert arrow schema to parquet schema
+/// Converter for arrow schema to parquet schema
 ///
-/// The name of the root schema element defaults to `"arrow_schema"`, this can 
be
-/// overridden with [`arrow_to_parquet_schema_with_root`]
-pub fn arrow_to_parquet_schema(schema: &Schema, coerce_types: bool) -> 
Result<SchemaDescriptor> {
-    arrow_to_parquet_schema_with_root(schema, "arrow_schema", coerce_types)
+/// Example:
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow_schema::{Field, Schema, DataType};
+/// # use parquet::arrow::ArrowToParquetSchemaConverter;
+/// use parquet::schema::types::{SchemaDescriptor, Type};
+/// use parquet::basic; // note there are two `Type`s in the following example
+/// let arrow_schema = Schema::new(vec![
+///   Field::new("a", DataType::Int64, true),
+///   Field::new("b", DataType::Date32, true),
+/// ]);
+///
+/// let parquet_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+///   .build()
+///   .unwrap();
+/// //
+/// let expected_parquet_schema = SchemaDescriptor::new(
+///   Arc::new(
+///     Type::group_type_builder("arrow_schema")
+///       .with_fields(vec![
+///         Arc::new(
+///          Type::primitive_type_builder("a", basic::Type::INT64)
+///           .build().unwrap()
+///         ),
+///         Arc::new(
+///          Type::primitive_type_builder("b", basic::Type::INT32)
+///           .with_converted_type(basic::ConvertedType::DATE)
+///           .with_logical_type(Some(basic::LogicalType::Date))
+///           .build().unwrap()
+///         ),
+///      ])
+///      .build().unwrap()
+///   )
+/// );
+///
+/// assert_eq!(parquet_schema, expected_parquet_schema);
+/// ```
+#[derive(Debug)]
+pub struct ArrowToParquetSchemaConverter<'a> {
+    /// The schema to convert
+    schema: &'a Schema,
+    /// Name of the root schema in Parquet
+    schema_root: &'a str,
+    /// Should we coerce arrow types to compatible Parquet types?
+    ///
+    /// See docs on [Self::with_coerce_types]
+    coerce_types: bool,
 }
 
-/// Convert arrow schema to parquet schema specifying the name of the root 
schema element
-pub fn arrow_to_parquet_schema_with_root(
-    schema: &Schema,
-    root: &str,
-    coerce_types: bool,
-) -> Result<SchemaDescriptor> {
-    let fields = schema
-        .fields()
-        .iter()
-        .map(|field| arrow_to_parquet_type(field, coerce_types).map(Arc::new))
-        .collect::<Result<_>>()?;
-    let group = Type::group_type_builder(root).with_fields(fields).build()?;
-    Ok(SchemaDescriptor::new(Arc::new(group)))
+impl<'a> ArrowToParquetSchemaConverter<'a> {
+    /// Create a new converter
+    pub fn new(schema: &'a Schema) -> Self {
+        Self {
+            schema,
+            schema_root: "arrow_schema",
+            coerce_types: false,
+        }
+    }
+
+    /// Should arrow types be coerced into parquet native types (default 
false).
+    ///
+    /// Setting this option to `true` will result in parquet files that can be
+    /// read by more readers, but may lose precision for arrow types such as
+    /// [`DataType::Date64`] which have no direct corresponding Parquet type.
+    ///
+    /// By default, does not coerce to native parquet types. Enabling type

Review Comment:
   ```suggestion
       /// By default, this converter does not coerce to native parquet types. 
Enabling type
   ```



##########
parquet/src/arrow/schema/mod.rs:
##########
@@ -225,27 +225,130 @@ pub(crate) fn 
add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
     }
 }
 
-/// Convert arrow schema to parquet schema
+/// Converter for arrow schema to parquet schema
 ///
-/// The name of the root schema element defaults to `"arrow_schema"`, this can 
be
-/// overridden with [`arrow_to_parquet_schema_with_root`]
-pub fn arrow_to_parquet_schema(schema: &Schema, coerce_types: bool) -> 
Result<SchemaDescriptor> {
-    arrow_to_parquet_schema_with_root(schema, "arrow_schema", coerce_types)
+/// Example:
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow_schema::{Field, Schema, DataType};
+/// # use parquet::arrow::ArrowToParquetSchemaConverter;
+/// use parquet::schema::types::{SchemaDescriptor, Type};
+/// use parquet::basic; // note there are two `Type`s in the following example
+/// let arrow_schema = Schema::new(vec![
+///   Field::new("a", DataType::Int64, true),
+///   Field::new("b", DataType::Date32, true),
+/// ]);
+///
+/// let parquet_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+///   .build()
+///   .unwrap();
+/// //
+/// let expected_parquet_schema = SchemaDescriptor::new(
+///   Arc::new(
+///     Type::group_type_builder("arrow_schema")
+///       .with_fields(vec![
+///         Arc::new(
+///          Type::primitive_type_builder("a", basic::Type::INT64)
+///           .build().unwrap()
+///         ),
+///         Arc::new(
+///          Type::primitive_type_builder("b", basic::Type::INT32)
+///           .with_converted_type(basic::ConvertedType::DATE)
+///           .with_logical_type(Some(basic::LogicalType::Date))
+///           .build().unwrap()
+///         ),
+///      ])
+///      .build().unwrap()
+///   )
+/// );
+///
+/// assert_eq!(parquet_schema, expected_parquet_schema);
+/// ```
+#[derive(Debug)]
+pub struct ArrowToParquetSchemaConverter<'a> {
+    /// The schema to convert
+    schema: &'a Schema,
+    /// Name of the root schema in Parquet
+    schema_root: &'a str,
+    /// Should we coerce arrow types to compatible Parquet types?
+    ///
+    /// See docs on [Self::with_coerce_types]
+    coerce_types: bool,
 }
 
-/// Convert arrow schema to parquet schema specifying the name of the root 
schema element
-pub fn arrow_to_parquet_schema_with_root(
-    schema: &Schema,
-    root: &str,
-    coerce_types: bool,
-) -> Result<SchemaDescriptor> {
-    let fields = schema
-        .fields()
-        .iter()
-        .map(|field| arrow_to_parquet_type(field, coerce_types).map(Arc::new))
-        .collect::<Result<_>>()?;
-    let group = Type::group_type_builder(root).with_fields(fields).build()?;
-    Ok(SchemaDescriptor::new(Arc::new(group)))
+impl<'a> ArrowToParquetSchemaConverter<'a> {
+    /// Create a new converter
+    pub fn new(schema: &'a Schema) -> Self {
+        Self {
+            schema,
+            schema_root: "arrow_schema",
+            coerce_types: false,
+        }
+    }
+
+    /// Should arrow types be coerced into parquet native types (default 
false).

Review Comment:
   ```suggestion
       /// Should arrow types be coerced into parquet native types (default 
`false`).
   ```



##########
parquet/src/arrow/schema/mod.rs:
##########
@@ -225,27 +225,130 @@ pub(crate) fn 
add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
     }
 }
 
-/// Convert arrow schema to parquet schema
+/// Converter for arrow schema to parquet schema
 ///
-/// The name of the root schema element defaults to `"arrow_schema"`, this can 
be
-/// overridden with [`arrow_to_parquet_schema_with_root`]
-pub fn arrow_to_parquet_schema(schema: &Schema, coerce_types: bool) -> 
Result<SchemaDescriptor> {
-    arrow_to_parquet_schema_with_root(schema, "arrow_schema", coerce_types)
+/// Example:
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow_schema::{Field, Schema, DataType};
+/// # use parquet::arrow::ArrowToParquetSchemaConverter;
+/// use parquet::schema::types::{SchemaDescriptor, Type};
+/// use parquet::basic; // note there are two `Type`s in the following example
+/// let arrow_schema = Schema::new(vec![
+///   Field::new("a", DataType::Int64, true),
+///   Field::new("b", DataType::Date32, true),
+/// ]);
+///
+/// let parquet_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+///   .build()
+///   .unwrap();
+/// //

Review Comment:
   ```suggestion
   ///
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to