[arrow] branch master updated: ARROW-4060: [Rust] Add parquet arrow converter.

agrove Tue, 08 Jan 2019 05:45:57 -0800

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new af07f75  ARROW-4060: [Rust] Add parquet arrow converter.
af07f75 is described below

commit af07f75c1f692d1ed4cea93d358ff1acda6a1771
Author: Renjie Liu <[email protected]>
AuthorDate: Tue Jan 8 06:45:13 2019 -0700

    ARROW-4060: [Rust] Add parquet arrow converter.
    
    This is the first step of adding an arrow reader and writer for parquet-rs.
    This commit contains a converter which converts parquet schema to arrow 
schema.
    Copied from this pr https://github.com/sunchao/parquet-rs/pull/185.
    
    Author: Renjie Liu <[email protected]>
    
    Closes #3279 from liurenjie1024/rust-arrow-schema-converter and squashes 
the following commits:
    
    1bfa00f <Renjie Liu> Resolve conflict
    8806b16 <Renjie Liu> Add parquet arrow converter
---
 rust/parquet/src/errors.rs                 |   6 +
 rust/parquet/src/lib.rs                    |   1 +
 rust/parquet/src/{lib.rs => reader/mod.rs} |  28 +-
 rust/parquet/src/reader/schema.rs          | 779 +++++++++++++++++++++++++++++
 rust/parquet/src/schema/types.rs           |  14 +-
 5 files changed, 805 insertions(+), 23 deletions(-)

diff --git a/rust/parquet/src/errors.rs b/rust/parquet/src/errors.rs
index a5532c1..abfbda9 100644
--- a/rust/parquet/src/errors.rs
+++ b/rust/parquet/src/errors.rs
@@ -50,6 +50,12 @@ quick_error! {
           display("EOF: {}", message)
               description(message)
       }
+      /// Arrow error.
+      /// Returned when reading into arrow or writing from arrow.
+      ArrowError(message:  String) {
+          display("Arrow: {}", message)
+              description(message)
+      }
   }
 }
 
diff --git a/rust/parquet/src/lib.rs b/rust/parquet/src/lib.rs
index 75c56f5..cad85ec 100644
--- a/rust/parquet/src/lib.rs
+++ b/rust/parquet/src/lib.rs
@@ -37,5 +37,6 @@ pub mod column;
 pub mod compression;
 mod encodings;
 pub mod file;
+pub mod reader;
 pub mod record;
 pub mod schema;
diff --git a/rust/parquet/src/lib.rs b/rust/parquet/src/reader/mod.rs
similarity index 64%
copy from rust/parquet/src/lib.rs
copy to rust/parquet/src/reader/mod.rs
index 75c56f5..fe580c5 100644
--- a/rust/parquet/src/lib.rs
+++ b/rust/parquet/src/reader/mod.rs
@@ -15,27 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#![feature(type_ascription)]
-#![feature(rustc_private)]
-#![feature(specialization)]
-#![feature(try_from)]
-#![allow(dead_code)]
-#![allow(non_camel_case_types)]
+//! [Apache Arrow](http://arrow.apache.org/) is a cross-language development 
platform for
+//! in-memory data.
+//!
+//! This module provides an API for converting between arrow and parquet.
 
-#[macro_use]
-pub mod errors;
-pub mod basic;
-pub mod data_type;
-
-// Exported for external use, such as benchmarks
-pub use self::encodings::{decoding, encoding};
-pub use self::util::memory;
-
-#[macro_use]
-mod util;
-pub mod column;
-pub mod compression;
-mod encodings;
-pub mod file;
-pub mod record;
 pub mod schema;
+
+pub use self::schema::{parquet_to_arrow_schema, 
parquet_to_arrow_schema_by_columns};
diff --git a/rust/parquet/src/reader/schema.rs 
b/rust/parquet/src/reader/schema.rs
new file mode 100644
index 0000000..68fd867
--- /dev/null
+++ b/rust/parquet/src/reader/schema.rs
@@ -0,0 +1,779 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Provides API for converting parquet schema to arrow schema and vice versa.
+//!
+//! The main interfaces for converting parquet schema to arrow schema are
+//! `parquet_to_arrow_schema` and `parquet_to_arrow_schema_by_columns`.
+//!
+//! The interfaces for converting arrow schema to parquet schema are not implemented yet.
+
+use std::{collections::HashSet, rc::Rc};
+
+use crate::basic::{LogicalType, Repetition, Type as PhysicalType};
+use crate::errors::{ParquetError::ArrowError, Result};
+use crate::schema::types::{SchemaDescPtr, Type, TypePtr};
+
+use arrow::datatypes::{DataType, Field, Schema};
+
+/// Convert parquet schema to arrow schema.
+pub fn parquet_to_arrow_schema(parquet_schema: SchemaDescPtr) -> 
Result<Schema> {
+    parquet_to_arrow_schema_by_columns(parquet_schema.clone(), 
0..parquet_schema.columns().len())
+}
+
+/// Convert parquet schema to arrow schema, only preserving some leaf columns.
+pub fn parquet_to_arrow_schema_by_columns<T>(
+    parquet_schema: SchemaDescPtr,
+    column_indices: T,
+) -> Result<Schema>
+where
+    T: IntoIterator<Item = usize>,
+{
+    let mut base_nodes = Vec::new();
+    let mut base_nodes_set = HashSet::new();
+    let mut leaves = HashSet::new();
+
+    for c in column_indices {
+        let column = parquet_schema.column(c).self_type() as *const Type;
+        let root = parquet_schema.get_column_root_ptr(c);
+        let root_raw_ptr = root.clone().as_ref() as *const Type;
+
+        leaves.insert(column);
+        if !base_nodes_set.contains(&root_raw_ptr) {
+            base_nodes.push(root);
+            base_nodes_set.insert(root_raw_ptr);
+        }
+    }
+
+    let leaves = Rc::new(leaves);
+    base_nodes
+        .into_iter()
+        .map(|t| ParquetTypeConverter::new(t, leaves.clone()).to_field())
+        .collect::<Result<Vec<Option<Field>>>>()
+        .map(|result| result.into_iter().filter_map(|f| 
f).collect::<Vec<Field>>())
+        .map(|fields| Schema::new(fields))
+}
+
+/// This struct is used to group methods and data structures used to convert 
parquet
+/// schema together.
+struct ParquetTypeConverter {
+    schema: TypePtr,
+    /// These are the columns that need to be converted to arrow schema.
+    columns_to_convert: Rc<HashSet<*const Type>>,
+}
+
+impl ParquetTypeConverter {
+    fn new(schema: TypePtr, columns_to_convert: Rc<HashSet<*const Type>>) -> 
Self {
+        Self {
+            schema,
+            columns_to_convert,
+        }
+    }
+
+    fn clone_with_schema(&self, other: TypePtr) -> Self {
+        Self {
+            schema: other,
+            columns_to_convert: self.columns_to_convert.clone(),
+        }
+    }
+}
+
+impl ParquetTypeConverter {
+    // Public interfaces.
+
+    /// Converts parquet schema to arrow data type.
+    ///
+    /// This function discards schema name.
+    ///
+    /// If this schema is a primitive type and not included in the leaves, the 
result is
+    /// Ok(None).
+    ///
+    /// If this schema is a group type and none of its children is preserved in 
the
+    /// conversion, the result is Ok(None).
+    fn to_data_type(&self) -> Result<Option<DataType>> {
+        match self.schema.as_ref() {
+            Type::PrimitiveType { .. } => self.to_primitive_type(),
+            Type::GroupType { .. } => self.to_group_type(),
+        }
+    }
+
+    /// Converts parquet schema to arrow field.
+    ///
+    /// This method is roughly the same as
+    /// [`to_data_type`](`ParquetTypeConverter::to_data_type`), except it 
preserves the schema
+    /// name.
+    fn to_field(&self) -> Result<Option<Field>> {
+        self.to_data_type()
+            .map(|opt| opt.map(|dt| Field::new(self.schema.name(), dt, 
self.is_nullable())))
+    }
+
+    // Utility functions.
+
+    /// Checks whether this schema is nullable.
+    fn is_nullable(&self) -> bool {
+        let basic_info = self.schema.get_basic_info();
+        if basic_info.has_repetition() {
+            match basic_info.repetition() {
+                Repetition::OPTIONAL => true,
+                Repetition::REPEATED => true,
+                Repetition::REQUIRED => false,
+            }
+        } else {
+            false
+        }
+    }
+
+    fn is_repeated(&self) -> bool {
+        let basic_info = self.schema.get_basic_info();
+
+        basic_info.has_repetition() && basic_info.repetition() == 
Repetition::REPEATED
+    }
+
+    fn is_self_included(&self) -> bool {
+        self.columns_to_convert
+            .contains(&(self.schema.as_ref() as *const Type))
+    }
+
+    // Functions for primitive types.
+
+    /// Entry point for converting parquet primitive type to arrow type.
+    ///
+    /// This function takes care of repetition.
+    fn to_primitive_type(&self) -> Result<Option<DataType>> {
+        if self.is_self_included() {
+            self.to_primitive_type_inner().map(|dt| {
+                if self.is_repeated() {
+                    Some(DataType::List(Box::new(dt)))
+                } else {
+                    Some(dt)
+                }
+            })
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Converts a parquet primitive type to an arrow data type.
+    fn to_primitive_type_inner(&self) -> Result<DataType> {
+        match self.schema.get_physical_type() {
+            PhysicalType::BOOLEAN => Ok(DataType::Boolean),
+            PhysicalType::INT32 => self.to_int32(),
+            PhysicalType::INT64 => self.to_int64(),
+            PhysicalType::FLOAT => Ok(DataType::Float32),
+            PhysicalType::DOUBLE => Ok(DataType::Float64),
+            PhysicalType::BYTE_ARRAY => self.to_byte_array(),
+            other => Err(ArrowError(format!(
+                "Unable to convert parquet type {}",
+                other
+            ))),
+        }
+    }
+
+    fn to_int32(&self) -> Result<DataType> {
+        match self.schema.get_basic_info().logical_type() {
+            LogicalType::NONE => Ok(DataType::Int32),
+            LogicalType::UINT_8 => Ok(DataType::UInt8),
+            LogicalType::UINT_16 => Ok(DataType::UInt16),
+            LogicalType::UINT_32 => Ok(DataType::UInt32),
+            LogicalType::INT_8 => Ok(DataType::Int8),
+            LogicalType::INT_16 => Ok(DataType::Int16),
+            LogicalType::INT_32 => Ok(DataType::Int32),
+            other => Err(ArrowError(format!(
+                "Unable to convert parquet logical type {}",
+                other
+            ))),
+        }
+    }
+
+    fn to_int64(&self) -> Result<DataType> {
+        match self.schema.get_basic_info().logical_type() {
+            LogicalType::NONE => Ok(DataType::Int64),
+            LogicalType::INT_64 => Ok(DataType::Int64),
+            LogicalType::UINT_64 => Ok(DataType::UInt64),
+            other => Err(ArrowError(format!(
+                "Unable to convert parquet logical type {}",
+                other
+            ))),
+        }
+    }
+
+    fn to_byte_array(&self) -> Result<DataType> {
+        match self.schema.get_basic_info().logical_type() {
+            LogicalType::UTF8 => Ok(DataType::Utf8),
+            other => Err(ArrowError(format!(
+                "Unable to convert parquet logical type {}",
+                other
+            ))),
+        }
+    }
+
+    // Functions for group types.
+
+    /// Entry point for converting parquet group type.
+    ///
+    /// This function takes care of logical type and repetition.
+    fn to_group_type(&self) -> Result<Option<DataType>> {
+        if self.is_repeated() {
+            self.to_struct()
+                .map(|opt| opt.map(|dt| DataType::List(Box::new(dt))))
+        } else {
+            match self.schema.get_basic_info().logical_type() {
+                LogicalType::LIST => self.to_list(),
+                _ => self.to_struct(),
+            }
+        }
+    }
+
+    /// Converts a parquet group type to arrow struct.
+    fn to_struct(&self) -> Result<Option<DataType>> {
+        match self.schema.as_ref() {
+            Type::PrimitiveType { .. } => panic!(
+                "{:?} is a struct type, and can't be processed as primitive.",
+                self.schema
+            ),
+            Type::GroupType {
+                basic_info: _,
+                fields,
+            } => fields
+                .iter()
+                .map(|field_ptr| 
self.clone_with_schema(field_ptr.clone()).to_field())
+                .collect::<Result<Vec<Option<Field>>>>()
+                .map(|result| result.into_iter().filter_map(|f| 
f).collect::<Vec<Field>>())
+                .map(|fields| {
+                    if fields.is_empty() {
+                        None
+                    } else {
+                        Some(DataType::Struct(fields))
+                    }
+                }),
+        }
+    }
+
+    /// Converts a parquet list to arrow list.
+    ///
+    /// To fully understand this algorithm, please refer to
+    /// [parquet 
doc](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md).
+    fn to_list(&self) -> Result<Option<DataType>> {
+        match self.schema.as_ref() {
+            Type::PrimitiveType { .. } => panic!(
+                "{:?} is a list type and can't be processed as primitive.",
+                self.schema
+            ),
+            Type::GroupType {
+                basic_info: _,
+                fields,
+            } if fields.len() == 1 => {
+                let list_item = fields.first().unwrap();
+                let item_converter = self.clone_with_schema(list_item.clone());
+
+                let item_type = match list_item.as_ref() {
+                    Type::PrimitiveType { .. } => {
+                        if item_converter.is_repeated() {
+                            item_converter.to_primitive_type_inner().map(|dt| 
Some(dt))
+                        } else {
+                            Err(ArrowError(
+                                "Primitive element type of list must be 
repeated.".to_string(),
+                            ))
+                        }
+                    }
+                    Type::GroupType {
+                        basic_info: _,
+                        fields,
+                    } => {
+                        if fields.len() > 1 {
+                            item_converter.to_struct()
+                        } else if fields.len() == 1
+                            && list_item.name() != "array"
+                            && list_item.name() != format!("{}_tuple", 
self.schema.name())
+                        {
+                            let nested_item = fields.first().unwrap();
+                            let nested_item_converter = 
self.clone_with_schema(nested_item.clone());
+
+                            nested_item_converter.to_data_type()
+                        } else {
+                            item_converter.to_struct()
+                        }
+                    }
+                };
+
+                item_type.map(|opt| opt.map(|dt| DataType::List(Box::new(dt))))
+            }
+            _ => Err(ArrowError(
+                "Group element type of list can only contain one 
field.".to_string(),
+            )),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::rc::Rc;
+
+    use crate::schema::{parser::parse_message_type, types::SchemaDescriptor};
+
+    use arrow::datatypes::{DataType, Field};
+
+    use super::{parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns};
+
+    #[test]
+    fn test_flat_primitives() {
+        let message_type = "
+        message test_schema {
+            REQUIRED BOOLEAN boolean;
+            REQUIRED INT32   int8  (INT_8);
+            REQUIRED INT32   int16 (INT_16);
+            REQUIRED INT32   int32;
+            REQUIRED INT64   int64 ;
+            OPTIONAL DOUBLE  double;
+            OPTIONAL FLOAT   float;
+            OPTIONAL BINARY  string (UTF8);
+        }
+        ";
+        let parquet_group_type = parse_message_type(message_type).unwrap();
+
+        let parquet_schema = 
SchemaDescriptor::new(Rc::new(parquet_group_type));
+        let converted_arrow_schema = 
parquet_to_arrow_schema(Rc::new(parquet_schema)).unwrap();
+
+        let arrow_fields = vec![
+            Field::new("boolean", DataType::Boolean, false),
+            Field::new("int8", DataType::Int8, false),
+            Field::new("int16", DataType::Int16, false),
+            Field::new("int32", DataType::Int32, false),
+            Field::new("int64", DataType::Int64, false),
+            Field::new("double", DataType::Float64, true),
+            Field::new("float", DataType::Float32, true),
+            Field::new("string", DataType::Utf8, true),
+        ];
+
+        assert_eq!(&arrow_fields, converted_arrow_schema.fields());
+    }
+
+    #[test]
+    fn test_duplicate_fields() {
+        let message_type = "
+        message test_schema {
+            REQUIRED BOOLEAN boolean;
+            REQUIRED INT32 int8 (INT_8);
+        }
+        ";
+
+        let parquet_group_type = parse_message_type(message_type).unwrap();
+
+        let parquet_schema = 
Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+        let converted_arrow_schema = 
parquet_to_arrow_schema(parquet_schema.clone()).unwrap();
+
+        let arrow_fields = vec![
+            Field::new("boolean", DataType::Boolean, false),
+            Field::new("int8", DataType::Int8, false),
+        ];
+        assert_eq!(&arrow_fields, converted_arrow_schema.fields());
+
+        let converted_arrow_schema =
+            parquet_to_arrow_schema_by_columns(parquet_schema.clone(), 
vec![0usize, 1usize])
+                .unwrap();
+        assert_eq!(&arrow_fields, converted_arrow_schema.fields());
+    }
+
+    #[test]
+    fn test_parquet_lists() {
+        let mut arrow_fields = Vec::new();
+
+        // LIST encoding example taken from parquet-format/LogicalTypes.md
+        let message_type = "
+        message test_schema {
+          REQUIRED GROUP my_list (LIST) {
+            REPEATED GROUP list {
+              OPTIONAL BINARY element (UTF8);
+            }
+          }
+          OPTIONAL GROUP my_list (LIST) {
+            REPEATED GROUP list {
+              REQUIRED BINARY element (UTF8);
+            }
+          }
+          OPTIONAL GROUP array_of_arrays (LIST) {
+            REPEATED GROUP list {
+              REQUIRED GROUP element (LIST) {
+                REPEATED GROUP list {
+                  REQUIRED INT32 element;
+                }
+              }
+            }
+          }
+          OPTIONAL GROUP my_list (LIST) {
+            REPEATED GROUP element {
+              REQUIRED BINARY str (UTF8);
+            }
+          }
+          OPTIONAL GROUP my_list (LIST) {
+            REPEATED INT32 element;
+          }
+          OPTIONAL GROUP my_list (LIST) {
+            REPEATED GROUP element {
+              REQUIRED BINARY str (UTF8);
+              REQUIRED INT32 num;
+            }
+          }
+          OPTIONAL GROUP my_list (LIST) {
+            REPEATED GROUP array {
+              REQUIRED BINARY str (UTF8);
+            }
+
+          }
+          OPTIONAL GROUP my_list (LIST) {
+            REPEATED GROUP my_list_tuple {
+              REQUIRED BINARY str (UTF8);
+            }
+          }
+          REPEATED INT32 name;
+        }
+        ";
+
+        // // List<String> (list non-null, elements nullable)
+        // required group my_list (LIST) {
+        //   repeated group list {
+        //     optional binary element (UTF8);
+        //   }
+        // }
+        {
+            arrow_fields.push(Field::new(
+                "my_list",
+                DataType::List(Box::new(DataType::Utf8)),
+                false,
+            ));
+        }
+
+        // // List<String> (list nullable, elements non-null)
+        // optional group my_list (LIST) {
+        //   repeated group list {
+        //     required binary element (UTF8);
+        //   }
+        // }
+        {
+            arrow_fields.push(Field::new(
+                "my_list",
+                DataType::List(Box::new(DataType::Utf8)),
+                true,
+            ));
+        }
+
+        // Element types can be nested structures. For example, a list of 
lists:
+        //
+        // // List<List<Integer>>
+        // optional group array_of_arrays (LIST) {
+        //   repeated group list {
+        //     required group element (LIST) {
+        //       repeated group list {
+        //         required int32 element;
+        //       }
+        //     }
+        //   }
+        // }
+        {
+            let arrow_inner_list = DataType::List(Box::new(DataType::Int32));
+            arrow_fields.push(Field::new(
+                "array_of_arrays",
+                DataType::List(Box::new(arrow_inner_list)),
+                true,
+            ));
+        }
+
+        // // List<String> (list nullable, elements non-null)
+        // optional group my_list (LIST) {
+        //   repeated group element {
+        //     required binary str (UTF8);
+        //   };
+        // }
+        {
+            arrow_fields.push(Field::new(
+                "my_list",
+                DataType::List(Box::new(DataType::Utf8)),
+                true,
+            ));
+        }
+
+        // // List<Integer> (nullable list, non-null elements)
+        // optional group my_list (LIST) {
+        //   repeated int32 element;
+        // }
+        {
+            arrow_fields.push(Field::new(
+                "my_list",
+                DataType::List(Box::new(DataType::Int32)),
+                true,
+            ));
+        }
+
+        // // List<Tuple<String, Integer>> (nullable list, non-null elements)
+        // optional group my_list (LIST) {
+        //   repeated group element {
+        //     required binary str (UTF8);
+        //     required int32 num;
+        //   };
+        // }
+        {
+            let arrow_struct = DataType::Struct(vec![
+                Field::new("str", DataType::Utf8, false),
+                Field::new("num", DataType::Int32, false),
+            ]);
+            arrow_fields.push(Field::new(
+                "my_list",
+                DataType::List(Box::new(arrow_struct)),
+                true,
+            ));
+        }
+
+        // // List<OneTuple<String>> (nullable list, non-null elements)
+        // optional group my_list (LIST) {
+        //   repeated group array {
+        //     required binary str (UTF8);
+        //   };
+        // }
+        // Special case: group is named array
+        {
+            let arrow_struct = DataType::Struct(vec![Field::new("str", 
DataType::Utf8, false)]);
+            arrow_fields.push(Field::new(
+                "my_list",
+                DataType::List(Box::new(arrow_struct)),
+                true,
+            ));
+        }
+
+        // // List<OneTuple<String>> (nullable list, non-null elements)
+        // optional group my_list (LIST) {
+        //   repeated group my_list_tuple {
+        //     required binary str (UTF8);
+        //   };
+        // }
+        // Special case: group name ends in _tuple
+        {
+            let arrow_struct = DataType::Struct(vec![Field::new("str", 
DataType::Utf8, false)]);
+            arrow_fields.push(Field::new(
+                "my_list",
+                DataType::List(Box::new(arrow_struct)),
+                true,
+            ));
+        }
+
+        // One-level encoding: Only allows required lists with required cells
+        //   repeated value_type name
+        {
+            arrow_fields.push(Field::new(
+                "name",
+                DataType::List(Box::new(DataType::Int32)),
+                true,
+            ));
+        }
+
+        let parquet_group_type = parse_message_type(message_type).unwrap();
+
+        let parquet_schema = 
Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+        let converted_arrow_schema = 
parquet_to_arrow_schema(parquet_schema.clone()).unwrap();
+        let converted_fields = converted_arrow_schema.fields();
+
+        assert_eq!(arrow_fields.len(), converted_fields.len());
+        for i in 0..arrow_fields.len() {
+            assert_eq!(arrow_fields[i], converted_fields[i]);
+        }
+    }
+
+    #[test]
+    fn test_nested_schema() {
+        let mut arrow_fields = Vec::new();
+        {
+            let group1_fields = vec![
+                Field::new("leaf1", DataType::Boolean, false),
+                Field::new("leaf2", DataType::Int32, false),
+            ];
+            let group1_struct = Field::new("group1", 
DataType::Struct(group1_fields), false);
+            arrow_fields.push(group1_struct);
+
+            let leaf3_field = Field::new("leaf3", DataType::Int64, false);
+            arrow_fields.push(leaf3_field);
+        }
+
+        let message_type = "
+        message test_schema {
+          REQUIRED GROUP group1 {
+            REQUIRED BOOLEAN leaf1;
+            REQUIRED INT32 leaf2;
+          }
+          REQUIRED INT64 leaf3;
+        }
+        ";
+        let parquet_group_type = parse_message_type(message_type).unwrap();
+
+        let parquet_schema = 
Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+        let converted_arrow_schema = 
parquet_to_arrow_schema(parquet_schema.clone()).unwrap();
+        let converted_fields = converted_arrow_schema.fields();
+
+        assert_eq!(arrow_fields.len(), converted_fields.len());
+        for i in 0..arrow_fields.len() {
+            assert_eq!(arrow_fields[i], converted_fields[i]);
+        }
+    }
+
+    #[test]
+    fn test_nested_schema_partial() {
+        let mut arrow_fields = Vec::new();
+        {
+            let group1_fields = vec![Field::new("leaf1", DataType::Int64, 
false)];
+            let group1 = Field::new("group1", DataType::Struct(group1_fields), 
false);
+            arrow_fields.push(group1);
+
+            let group2_fields = vec![Field::new("leaf4", DataType::Int64, 
false)];
+            let group2 = Field::new("group2", DataType::Struct(group2_fields), 
false);
+            arrow_fields.push(group2);
+
+            arrow_fields.push(Field::new("leaf5", DataType::Int64, false));
+        }
+
+        let message_type = "
+        message test_schema {
+          REQUIRED GROUP group1 {
+            REQUIRED INT64 leaf1;
+            REQUIRED INT64 leaf2;
+          }
+          REQUIRED  GROUP group2 {
+            REQUIRED INT64 leaf3;
+            REQUIRED INT64 leaf4;
+          }
+          REQUIRED INT64 leaf5;
+        }
+        ";
+        let parquet_group_type = parse_message_type(message_type).unwrap();
+
+        // Expected partial arrow schema (columns 0, 3, 4):
+        // required group group1 {
+        //   required int64 leaf1;
+        // }
+        // required group group2 {
+        //   required int64 leaf4;
+        // }
+        // required int64 leaf5;
+
+        let parquet_schema = 
Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+        let converted_arrow_schema =
+            parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![0, 
3, 4]).unwrap();
+        let converted_fields = converted_arrow_schema.fields();
+
+        assert_eq!(arrow_fields.len(), converted_fields.len());
+        for i in 0..arrow_fields.len() {
+            assert_eq!(arrow_fields[i], converted_fields[i]);
+        }
+    }
+
+    #[test]
+    fn test_nested_schema_partial_ordering() {
+        let mut arrow_fields = Vec::new();
+        {
+            let group2_fields = vec![Field::new("leaf4", DataType::Int64, 
false)];
+            let group2 = Field::new("group2", DataType::Struct(group2_fields), 
false);
+            arrow_fields.push(group2);
+
+            arrow_fields.push(Field::new("leaf5", DataType::Int64, false));
+
+            let group1_fields = vec![Field::new("leaf1", DataType::Int64, 
false)];
+            let group1 = Field::new("group1", DataType::Struct(group1_fields), 
false);
+            arrow_fields.push(group1);
+        }
+
+        let message_type = "
+        message test_schema {
+          REQUIRED GROUP group1 {
+            REQUIRED INT64 leaf1;
+            REQUIRED INT64 leaf2;
+          }
+          REQUIRED  GROUP group2 {
+            REQUIRED INT64 leaf3;
+            REQUIRED INT64 leaf4;
+          }
+          REQUIRED INT64 leaf5;
+        }
+        ";
+        let parquet_group_type = parse_message_type(message_type).unwrap();
+
+        // Expected partial arrow schema (columns 3, 4, 0):
+        // required group group2 {
+        //   required int64 leaf4;
+        // }
+        // required int64 leaf5;
+        // required group group1 {
+        //   required int64 leaf1;
+        // }
+
+        let parquet_schema = 
Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+        let converted_arrow_schema =
+            parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![3, 
4, 0]).unwrap();
+        let converted_fields = converted_arrow_schema.fields();
+
+        assert_eq!(arrow_fields.len(), converted_fields.len());
+        for i in 0..arrow_fields.len() {
+            assert_eq!(arrow_fields[i], converted_fields[i]);
+        }
+    }
+
+    #[test]
+    fn test_repeated_nested_schema() {
+        let mut arrow_fields = Vec::new();
+        {
+            arrow_fields.push(Field::new("leaf1", DataType::Int32, true));
+
+            let inner_group_list = Field::new(
+                "innerGroup",
+                DataType::List(Box::new(DataType::Struct(vec![Field::new(
+                    "leaf3",
+                    DataType::Int32,
+                    true,
+                )]))),
+                true,
+            );
+
+            let outer_group_list = Field::new(
+                "outerGroup",
+                DataType::List(Box::new(DataType::Struct(vec![
+                    Field::new("leaf2", DataType::Int32, true),
+                    inner_group_list,
+                ]))),
+                true,
+            );
+            arrow_fields.push(outer_group_list);
+        }
+
+        let message_type = "
+        message test_schema {
+          OPTIONAL INT32 leaf1;
+          REPEATED GROUP outerGroup {
+            OPTIONAL INT32 leaf2;
+            REPEATED GROUP innerGroup {
+              OPTIONAL INT32 leaf3;
+            }
+          }
+        }
+        ";
+        let parquet_group_type = parse_message_type(message_type).unwrap();
+
+        let parquet_schema = 
Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+        let converted_arrow_schema = 
parquet_to_arrow_schema(parquet_schema.clone()).unwrap();
+        let converted_fields = converted_arrow_schema.fields();
+
+        assert_eq!(arrow_fields.len(), converted_fields.len());
+        for i in 0..arrow_fields.len() {
+            assert_eq!(arrow_fields[i], converted_fields[i]);
+        }
+    }
+}
diff --git a/rust/parquet/src/schema/types.rs b/rust/parquet/src/schema/types.rs
index 30ee9f6..aa314d6 100644
--- a/rust/parquet/src/schema/types.rs
+++ b/rust/parquet/src/schema/types.rs
@@ -741,19 +741,31 @@ impl SchemaDescriptor {
 
     /// Returns column root [`Type`](`::schema::types::Type`) for a field 
position.
     pub fn get_column_root(&self, i: usize) -> &Type {
+        let result = self.column_root_of(i);
+        result.as_ref()
+    }
+
+    /// Returns column root [`Type`](`::schema::types::Type`) pointer for a 
field position.
+    pub fn get_column_root_ptr(&self, i: usize) -> TypePtr {
+        let result = self.column_root_of(i);
+        result.clone()
+    }
+
+    fn column_root_of(&self, i: usize) -> &Rc<Type> {
         assert!(
             i < self.leaves.len(),
             "Index out of bound: {} not in [0, {})",
             i,
             self.leaves.len()
         );
+
         let result = self.leaf_to_base.get(&i);
         assert!(
             result.is_some(),
             "Expected a value for index {} but found None",
             i
         );
-        result.unwrap().as_ref()
+        result.unwrap()
     }
 
     /// Returns schema as [`Type`](`::schema::types::Type`).

[arrow] branch master updated: ARROW-4060: [Rust] Add parquet arrow converter.

Reply via email to