alamb commented on code in PR #6840:
URL: https://github.com/apache/arrow-rs/pull/6840#discussion_r1872225592
##########
parquet/src/arrow/schema/mod.rs:
##########
@@ -225,29 +225,99 @@ pub(crate) fn
add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
}
}
+/// Converter for arrow schema to parquet schema
+///
+/// Example:
+/// ```
+/// # use arrow_schema::{Field, Schema, DataType};
+/// use parquet::arrow::ArrowToParquetSchemaConverter;
+/// let schema = Schema::new(vec![
+/// Field::new("a", DataType::Int64, false),
+/// Field::new("b", DataType::Date32, false),
+/// ]);
+///
+/// let parquet_schema = ArrowToParquetSchemaConverter::new(&schema)
+/// .build()
+/// .unwrap();
+///
+///
+/// ```
+#[derive(Debug)]
+pub struct ArrowToParquetSchemaConverter<'a> {
+ /// The schema to convert
+ schema: &'a Schema,
+ /// Name of the root schema in Parquet
+ schema_root: &'a str,
+    /// Should we coerce arrow types to compatible Parquet types?
+    ///
+    /// See docs on [`Self::with_coerce_types`]
+    coerce_types: bool,
+}
+
+impl <'a> ArrowToParquetSchemaConverter<'a> {
+ /// Create a new converter
+ pub fn new(schema: &'a Schema) -> Self {
+ Self {
+ schema,
+ schema_root: "arrow_schema",
+ coerce_types: false,
+ }
+ }
+
+ /// Should arrow types be coerced into parquet native types (default
false).
+ ///
+ /// Setting this option to `true` will result in parquet files that can be
+ /// read by more readers, but may lose precision for arrow types such as
+ /// [`DataType::Date64`] which have no direct corresponding Parquet type.
+ ///
+ /// # Discussion
+ ///
+ /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
+    /// corresponding Parquet logical type. Thus, they cannot be losslessly
+ /// round-tripped when stored using the appropriate Parquet logical type.
+ ///
+ /// For example, some Date64 values may be truncated when stored with
+ /// parquet's native 32 bit date type.
+ ///
+ /// By default, the arrow writer does not coerce to native parquet types.
It
+    /// writes data in such a way that it can be losslessly round-tripped.
+ /// However, this means downstream readers must be aware of and correctly
+ /// interpret the embedded Arrow schema.
+ pub fn with_coerce_types(mut self, coerce_types: bool) -> Self {
+ self.coerce_types = coerce_types;
+ self
+ }
+
+ /// Set the root schema element name (defaults to `"arrow_schema"`).
+ pub fn schema_root(mut self, schema_root: &'a str) -> Self {
+ self.schema_root = schema_root;
+ self
+ }
+
+ /// Build the desired parquet [`SchemaDescriptor`]
+ pub fn build(self) -> Result<SchemaDescriptor> {
+ let Self { schema, schema_root: root_schema_name, coerce_types } =
self;
+ let fields = schema
+ .fields()
+ .iter()
+ .map(|field| arrow_to_parquet_type(field,
coerce_types).map(Arc::new))
+ .collect::<Result<_>>()?;
+ let group =
Type::group_type_builder(root_schema_name).with_fields(fields).build()?;
+ Ok(SchemaDescriptor::new(Arc::new(group)))
+ }
+}
+
/// Convert arrow schema to parquet schema
///
/// The name of the root schema element defaults to `"arrow_schema"`; this can
be
/// overridden with [`arrow_to_parquet_schema_with_root`]
-pub fn arrow_to_parquet_schema(schema: &Schema, coerce_types: bool) ->
Result<SchemaDescriptor> {
- arrow_to_parquet_schema_with_root(schema, "arrow_schema", coerce_types)
-}
+#[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter`
instead")]
+pub fn arrow_to_parquet_schema(schema: &Schema) -> Result<SchemaDescriptor> {
Review Comment:
This reverts the changes made in
https://github.com/apache/arrow-rs/pull/6313/files#diff-6a684124e78f254fa6ea23b98f04be0a3ff5a2e4b24890d9c6f4dd850ba11333L232
##########
parquet/src/file/properties.rs:
##########
@@ -287,15 +286,13 @@ impl WriterProperties {
self.statistics_truncate_length
}
- /// Returns `coerce_types` boolean
+ /// Should the writer coerce types to parquet native types.
+ ///
+ /// Setting this option to `true` will result in parquet files that can be
+ /// read by more readers, but may lose precision for arrow types such as
+ /// [`DataType::Date64`] which have no direct corresponding Parquet type.
Review Comment:
I also felt it would help if we described more explicitly what the impact of
enabling this option was (and left a link to the longer backstory)
##########
parquet/src/arrow/schema/mod.rs:
##########
@@ -225,29 +225,99 @@ pub(crate) fn
add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
}
}
+/// Converter for arrow schema to parquet schema
+///
+/// Example:
+/// ```
+/// # use arrow_schema::{Field, Schema, DataType};
+/// use parquet::arrow::ArrowToParquetSchemaConverter;
+/// let schema = Schema::new(vec![
+/// Field::new("a", DataType::Int64, false),
+/// Field::new("b", DataType::Date32, false),
+/// ]);
+///
+/// let parquet_schema = ArrowToParquetSchemaConverter::new(&schema)
+/// .build()
+/// .unwrap();
+///
+///
+/// ```
+#[derive(Debug)]
+pub struct ArrowToParquetSchemaConverter<'a> {
+ /// The schema to convert
+ schema: &'a Schema,
+ /// Name of the root schema in Parquet
+ schema_root: &'a str,
+    /// Should we coerce arrow types to compatible Parquet types?
+    ///
+    /// See docs on [`Self::with_coerce_types`]
+    coerce_types: bool,
+}
+
+impl <'a> ArrowToParquetSchemaConverter<'a> {
+ /// Create a new converter
+ pub fn new(schema: &'a Schema) -> Self {
+ Self {
+ schema,
+ schema_root: "arrow_schema",
+ coerce_types: false,
+ }
+ }
+
+ /// Should arrow types be coerced into parquet native types (default
false).
+ ///
+ /// Setting this option to `true` will result in parquet files that can be
+ /// read by more readers, but may lose precision for arrow types such as
+ /// [`DataType::Date64`] which have no direct corresponding Parquet type.
+ ///
+ /// # Discussion
+ ///
+ /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
+    /// corresponding Parquet logical type. Thus, they cannot be losslessly
+ /// round-tripped when stored using the appropriate Parquet logical type.
+ ///
+ /// For example, some Date64 values may be truncated when stored with
+ /// parquet's native 32 bit date type.
+ ///
+ /// By default, the arrow writer does not coerce to native parquet types.
It
+    /// writes data in such a way that it can be losslessly round-tripped.
+ /// However, this means downstream readers must be aware of and correctly
+ /// interpret the embedded Arrow schema.
+ pub fn with_coerce_types(mut self, coerce_types: bool) -> Self {
+ self.coerce_types = coerce_types;
+ self
+ }
+
+ /// Set the root schema element name (defaults to `"arrow_schema"`).
+ pub fn schema_root(mut self, schema_root: &'a str) -> Self {
+ self.schema_root = schema_root;
+ self
+ }
+
+ /// Build the desired parquet [`SchemaDescriptor`]
+ pub fn build(self) -> Result<SchemaDescriptor> {
+ let Self { schema, schema_root: root_schema_name, coerce_types } =
self;
+ let fields = schema
+ .fields()
+ .iter()
+ .map(|field| arrow_to_parquet_type(field,
coerce_types).map(Arc::new))
+ .collect::<Result<_>>()?;
+ let group =
Type::group_type_builder(root_schema_name).with_fields(fields).build()?;
+ Ok(SchemaDescriptor::new(Arc::new(group)))
+ }
+}
+
/// Convert arrow schema to parquet schema
///
/// The name of the root schema element defaults to `"arrow_schema"`; this can
be
/// overridden with [`arrow_to_parquet_schema_with_root`]
-pub fn arrow_to_parquet_schema(schema: &Schema, coerce_types: bool) ->
Result<SchemaDescriptor> {
- arrow_to_parquet_schema_with_root(schema, "arrow_schema", coerce_types)
-}
+#[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter`
instead")]
+pub fn arrow_to_parquet_schema(schema: &Schema) -> Result<SchemaDescriptor> {
-/// Convert arrow schema to parquet schema specifying the name of the root
schema element
-pub fn arrow_to_parquet_schema_with_root(
Review Comment:
It turns out this function is not actually exported (it is pub in this
module, but not pub exported):
https://docs.rs/parquet/latest/parquet/?search=arrow_to_parquet_schema_with_root
Returns no results
The compiler told me it was unused once I switched everything over to use
`ArrowToParquetSchemaConverter`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]