rluvaton commented on code in PR #8496:
URL: https://github.com/apache/arrow-rs/pull/8496#discussion_r2389078447
##########
parquet/src/arrow/schema/complex.rs:
##########
@@ -620,7 +725,690 @@ pub fn convert_type(parquet_type: &TypePtr) ->
Result<ParquetField> {
rep_level: 0,
def_level: 0,
data_type: None,
+ // We might be inside list
+ treat_repeated_as_list_arrow_hint: false,
};
Ok(visitor.dispatch(parquet_type, context)?.unwrap())
}
+
+#[cfg(test)]
+mod tests {
+ use crate::arrow::schema::complex::convert_schema;
+ use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY};
+ use crate::schema::parser::parse_message_type;
+ use crate::schema::types::SchemaDescriptor;
+ use arrow_schema::{DataType, Field, Fields};
+ use std::sync::Arc;
+
+ trait WithFieldId {
+ fn with_field_id(self, id: i32) -> Self;
+ }
+ impl WithFieldId for arrow_schema::Field {
+ fn with_field_id(self, id: i32) -> Self {
+ let mut metadata = self.metadata().clone();
+ metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(),
id.to_string());
+ self.with_metadata(metadata)
+ }
+ }
+
+ fn test_roundtrip(message_type: &str) -> crate::errors::Result<()> {
+ let parsed_input_schema = Arc::new(parse_message_type(message_type)?);
+ let schema = SchemaDescriptor::new(parsed_input_schema);
+
+ let converted = convert_schema(&schema, ProjectionMask::all(),
None)?.unwrap();
+
+ let DataType::Struct(schema_fields) = &converted.arrow_type else {
+ panic!("Expected struct from convert_schema");
+ };
+
+ // Should be able to convert the same thing
+ let converted_again =
+ convert_schema(&schema, ProjectionMask::all(),
Some(schema_fields))?.unwrap();
+
+ // Assert that we changed to Utf8
+ assert_eq!(converted_again.arrow_type, converted.arrow_type);
+
+ Ok(())
+ }
+
+ fn test_expected_type(
+ message_type: &str,
+ expected_fields: Fields,
+ ) -> crate::errors::Result<()> {
+ test_roundtrip(message_type)?;
+
+ let parsed_input_schema = Arc::new(parse_message_type(message_type)?);
+ let schema = SchemaDescriptor::new(parsed_input_schema);
+
+ let converted = convert_schema(&schema, ProjectionMask::all(),
None)?.unwrap();
+
+ let DataType::Struct(schema_fields) = &converted.arrow_type else {
+ panic!("Expected struct from convert_schema");
+ };
+
+ assert_eq!(schema_fields, &expected_fields);
+
+ Ok(())
+ }
+
+ #[test]
+ fn basic_backward_compatible_list_1() -> crate::errors::Result<()> {
+ test_expected_type(
+ "
+ message schema {
+ optional group my_list (LIST) {
+ repeated int32 element;
+ }
+ }
+ ",
+ Fields::from(vec![
+ // Rule 1: List<Integer> (nullable list, non-null elements)
+ Field::new(
+ "my_list",
+ DataType::List(Arc::new(Field::new("element",
DataType::Int32, false))),
+ true,
+ ),
+ ]),
+ )
+ }
+
+ #[test]
+ #[ignore = "not working yet"]
Review Comment:
Ignored as this is failing here and in main
##########
parquet/src/arrow/schema/complex.rs:
##########
@@ -620,7 +725,690 @@ pub fn convert_type(parquet_type: &TypePtr) ->
Result<ParquetField> {
rep_level: 0,
def_level: 0,
data_type: None,
+ // We might be inside list
+ treat_repeated_as_list_arrow_hint: false,
};
Ok(visitor.dispatch(parquet_type, context)?.unwrap())
}
+
+#[cfg(test)]
+mod tests {
+ use crate::arrow::schema::complex::convert_schema;
+ use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY};
+ use crate::schema::parser::parse_message_type;
+ use crate::schema::types::SchemaDescriptor;
+ use arrow_schema::{DataType, Field, Fields};
+ use std::sync::Arc;
+
+ trait WithFieldId {
+ fn with_field_id(self, id: i32) -> Self;
+ }
+ impl WithFieldId for arrow_schema::Field {
+ fn with_field_id(self, id: i32) -> Self {
+ let mut metadata = self.metadata().clone();
+ metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(),
id.to_string());
+ self.with_metadata(metadata)
+ }
+ }
+
+ fn test_roundtrip(message_type: &str) -> crate::errors::Result<()> {
+ let parsed_input_schema = Arc::new(parse_message_type(message_type)?);
+ let schema = SchemaDescriptor::new(parsed_input_schema);
+
+ let converted = convert_schema(&schema, ProjectionMask::all(),
None)?.unwrap();
+
+ let DataType::Struct(schema_fields) = &converted.arrow_type else {
+ panic!("Expected struct from convert_schema");
+ };
+
+ // Should be able to convert the same thing
+ let converted_again =
+ convert_schema(&schema, ProjectionMask::all(),
Some(schema_fields))?.unwrap();
+
+ // Assert that we changed to Utf8
+ assert_eq!(converted_again.arrow_type, converted.arrow_type);
+
+ Ok(())
+ }
+
+ fn test_expected_type(
+ message_type: &str,
+ expected_fields: Fields,
+ ) -> crate::errors::Result<()> {
+ test_roundtrip(message_type)?;
+
+ let parsed_input_schema = Arc::new(parse_message_type(message_type)?);
+ let schema = SchemaDescriptor::new(parsed_input_schema);
+
+ let converted = convert_schema(&schema, ProjectionMask::all(),
None)?.unwrap();
+
+ let DataType::Struct(schema_fields) = &converted.arrow_type else {
+ panic!("Expected struct from convert_schema");
+ };
+
+ assert_eq!(schema_fields, &expected_fields);
+
+ Ok(())
+ }
+
+ #[test]
+ fn basic_backward_compatible_list_1() -> crate::errors::Result<()> {
+ test_expected_type(
+ "
+ message schema {
+ optional group my_list (LIST) {
+ repeated int32 element;
+ }
+ }
+ ",
+ Fields::from(vec![
+ // Rule 1: List<Integer> (nullable list, non-null elements)
+ Field::new(
+ "my_list",
+ DataType::List(Arc::new(Field::new("element",
DataType::Int32, false))),
+ true,
+ ),
+ ]),
+ )
+ }
+
+ #[test]
+ #[ignore = "not working yet"]
+ fn basic_backward_compatible_list_2() -> crate::errors::Result<()> {
+ test_expected_type(
+ "
+ message schema {
+ optional group my_list (LIST) {
+ repeated group element {
+ required binary str (STRING);
+ required int32 num;
+ };
+ }
+ }
+ ",
+ Fields::from(vec![
+ // Rule 2: List<Tuple<String, Integer>> (nullable list,
non-null elements)
+ Field::new(
+ "my_list",
+ DataType::List(Arc::new(Field::new(
+ "element",
+ DataType::Struct(Fields::from(vec![
+ Field::new("str", DataType::Binary, false),
+ Field::new("num", DataType::Int32, false),
+ ])),
+ false,
+ ))),
+ true,
+ ),
+ ]),
+ )
+ }
+
+ #[test]
+ #[ignore = "not working yet"]
Review Comment:
Ignored as this is failing here and in main
##########
parquet/src/arrow/schema/complex.rs:
##########
@@ -620,7 +725,690 @@ pub fn convert_type(parquet_type: &TypePtr) ->
Result<ParquetField> {
rep_level: 0,
def_level: 0,
data_type: None,
+ // We might be inside list
+ treat_repeated_as_list_arrow_hint: false,
};
Ok(visitor.dispatch(parquet_type, context)?.unwrap())
}
+
+#[cfg(test)]
+mod tests {
+ use crate::arrow::schema::complex::convert_schema;
+ use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY};
+ use crate::schema::parser::parse_message_type;
+ use crate::schema::types::SchemaDescriptor;
+ use arrow_schema::{DataType, Field, Fields};
+ use std::sync::Arc;
+
+ trait WithFieldId {
+ fn with_field_id(self, id: i32) -> Self;
+ }
+ impl WithFieldId for arrow_schema::Field {
+ fn with_field_id(self, id: i32) -> Self {
+ let mut metadata = self.metadata().clone();
+ metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(),
id.to_string());
+ self.with_metadata(metadata)
+ }
+ }
+
+ fn test_roundtrip(message_type: &str) -> crate::errors::Result<()> {
+ let parsed_input_schema = Arc::new(parse_message_type(message_type)?);
+ let schema = SchemaDescriptor::new(parsed_input_schema);
+
+ let converted = convert_schema(&schema, ProjectionMask::all(),
None)?.unwrap();
+
+ let DataType::Struct(schema_fields) = &converted.arrow_type else {
+ panic!("Expected struct from convert_schema");
+ };
+
+ // Should be able to convert the same thing
+ let converted_again =
+ convert_schema(&schema, ProjectionMask::all(),
Some(schema_fields))?.unwrap();
+
+ // Assert that we changed to Utf8
+ assert_eq!(converted_again.arrow_type, converted.arrow_type);
+
+ Ok(())
+ }
+
+ fn test_expected_type(
+ message_type: &str,
+ expected_fields: Fields,
+ ) -> crate::errors::Result<()> {
+ test_roundtrip(message_type)?;
+
+ let parsed_input_schema = Arc::new(parse_message_type(message_type)?);
+ let schema = SchemaDescriptor::new(parsed_input_schema);
+
+ let converted = convert_schema(&schema, ProjectionMask::all(),
None)?.unwrap();
+
+ let DataType::Struct(schema_fields) = &converted.arrow_type else {
+ panic!("Expected struct from convert_schema");
+ };
+
+ assert_eq!(schema_fields, &expected_fields);
+
+ Ok(())
+ }
+
+ #[test]
+ fn basic_backward_compatible_list_1() -> crate::errors::Result<()> {
+ test_expected_type(
+ "
+ message schema {
+ optional group my_list (LIST) {
+ repeated int32 element;
+ }
+ }
+ ",
+ Fields::from(vec![
+ // Rule 1: List<Integer> (nullable list, non-null elements)
+ Field::new(
+ "my_list",
+ DataType::List(Arc::new(Field::new("element",
DataType::Int32, false))),
+ true,
+ ),
+ ]),
+ )
+ }
+
+ #[test]
+ #[ignore = "not working yet"]
+ fn basic_backward_compatible_list_2() -> crate::errors::Result<()> {
+ test_expected_type(
+ "
+ message schema {
+ optional group my_list (LIST) {
+ repeated group element {
+ required binary str (STRING);
+ required int32 num;
+ };
+ }
+ }
+ ",
+ Fields::from(vec![
+ // Rule 2: List<Tuple<String, Integer>> (nullable list,
non-null elements)
+ Field::new(
+ "my_list",
+ DataType::List(Arc::new(Field::new(
+ "element",
+ DataType::Struct(Fields::from(vec![
+ Field::new("str", DataType::Binary, false),
+ Field::new("num", DataType::Int32, false),
+ ])),
+ false,
+ ))),
+ true,
+ ),
+ ]),
+ )
+ }
+
+ #[test]
+ #[ignore = "not working yet"]
+ fn basic_backward_compatible_list_3() -> crate::errors::Result<()> {
+ test_expected_type(
+ "
+ message schema {
+ optional group my_list (LIST) {
+ repeated group array (LIST) {
+ repeated int32 array;
+ };
+ }
+ }
+ ",
+ Fields::from(vec![
+ // Rule 3: List<List<Integer>> (nullable outer list, non-null
elements)
+ Field::new(
+ "my_list",
+ DataType::List(Arc::new(Field::new(
+ "array",
+ DataType::List(Arc::new(Field::new("array",
DataType::Int32, false))),
+ false,
+ ))),
+ true,
+ ),
+ ]),
+ )
+ }
+
+ #[test]
+ #[ignore = "not working yet"]
Review Comment:
Ignored as this is failing here and in main
##########
parquet/src/arrow/schema/complex.rs:
##########
@@ -620,7 +725,690 @@ pub fn convert_type(parquet_type: &TypePtr) ->
Result<ParquetField> {
rep_level: 0,
def_level: 0,
data_type: None,
+ // We might be inside list
+ treat_repeated_as_list_arrow_hint: false,
};
Ok(visitor.dispatch(parquet_type, context)?.unwrap())
}
+
+#[cfg(test)]
+mod tests {
+ use crate::arrow::schema::complex::convert_schema;
+ use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY};
+ use crate::schema::parser::parse_message_type;
+ use crate::schema::types::SchemaDescriptor;
+ use arrow_schema::{DataType, Field, Fields};
+ use std::sync::Arc;
+
+ trait WithFieldId {
+ fn with_field_id(self, id: i32) -> Self;
+ }
+ impl WithFieldId for arrow_schema::Field {
+ fn with_field_id(self, id: i32) -> Self {
+ let mut metadata = self.metadata().clone();
+ metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(),
id.to_string());
+ self.with_metadata(metadata)
+ }
+ }
+
+ fn test_roundtrip(message_type: &str) -> crate::errors::Result<()> {
+ let parsed_input_schema = Arc::new(parse_message_type(message_type)?);
+ let schema = SchemaDescriptor::new(parsed_input_schema);
+
+ let converted = convert_schema(&schema, ProjectionMask::all(),
None)?.unwrap();
+
+ let DataType::Struct(schema_fields) = &converted.arrow_type else {
+ panic!("Expected struct from convert_schema");
+ };
+
+ // Should be able to convert the same thing
+ let converted_again =
+ convert_schema(&schema, ProjectionMask::all(),
Some(schema_fields))?.unwrap();
+
+ // Assert that we changed to Utf8
+ assert_eq!(converted_again.arrow_type, converted.arrow_type);
+
+ Ok(())
+ }
+
+ fn test_expected_type(
+ message_type: &str,
+ expected_fields: Fields,
+ ) -> crate::errors::Result<()> {
+ test_roundtrip(message_type)?;
+
+ let parsed_input_schema = Arc::new(parse_message_type(message_type)?);
+ let schema = SchemaDescriptor::new(parsed_input_schema);
+
+ let converted = convert_schema(&schema, ProjectionMask::all(),
None)?.unwrap();
+
+ let DataType::Struct(schema_fields) = &converted.arrow_type else {
+ panic!("Expected struct from convert_schema");
+ };
+
+ assert_eq!(schema_fields, &expected_fields);
+
+ Ok(())
+ }
+
+ #[test]
+ fn basic_backward_compatible_list_1() -> crate::errors::Result<()> {
+ test_expected_type(
+ "
+ message schema {
+ optional group my_list (LIST) {
+ repeated int32 element;
+ }
+ }
+ ",
+ Fields::from(vec![
+ // Rule 1: List<Integer> (nullable list, non-null elements)
+ Field::new(
+ "my_list",
+ DataType::List(Arc::new(Field::new("element",
DataType::Int32, false))),
+ true,
+ ),
+ ]),
+ )
+ }
+
+ #[test]
+ #[ignore = "not working yet"]
+ fn basic_backward_compatible_list_2() -> crate::errors::Result<()> {
+ test_expected_type(
+ "
+ message schema {
+ optional group my_list (LIST) {
+ repeated group element {
+ required binary str (STRING);
+ required int32 num;
+ };
+ }
+ }
+ ",
+ Fields::from(vec![
+ // Rule 2: List<Tuple<String, Integer>> (nullable list,
non-null elements)
+ Field::new(
+ "my_list",
+ DataType::List(Arc::new(Field::new(
+ "element",
+ DataType::Struct(Fields::from(vec![
+ Field::new("str", DataType::Binary, false),
+ Field::new("num", DataType::Int32, false),
+ ])),
+ false,
+ ))),
+ true,
+ ),
+ ]),
+ )
+ }
+
+ #[test]
+ #[ignore = "not working yet"]
+ fn basic_backward_compatible_list_3() -> crate::errors::Result<()> {
+ test_expected_type(
+ "
+ message schema {
+ optional group my_list (LIST) {
+ repeated group array (LIST) {
+ repeated int32 array;
+ };
+ }
+ }
+ ",
+ Fields::from(vec![
+ // Rule 3: List<List<Integer>> (nullable outer list, non-null
elements)
+ Field::new(
+ "my_list",
+ DataType::List(Arc::new(Field::new(
+ "array",
+ DataType::List(Arc::new(Field::new("array",
DataType::Int32, false))),
+ false,
+ ))),
+ true,
+ ),
+ ]),
+ )
+ }
+
+ #[test]
+ #[ignore = "not working yet"]
+ fn basic_backward_compatible_list_4_1() -> crate::errors::Result<()> {
+ test_roundtrip(
+ "
+ message schema {
+ optional group my_list (LIST) {
+ repeated group array {
+ required binary str (STRING);
+ };
+ }
+ }
+ ",
+ )
+ }
+
+ #[test]
+ #[ignore = "not working yet"]
Review Comment:
Ignored as this is failing here and in main
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]