This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 6348dc3d0c0 Accept parquet schemas without explicitly required Map keys (#5630)
6348dc3d0c0 is described below
commit 6348dc3d0c055a27fe5c111c2cbbbfdadf92ab9d
Author: Pieter Raubenheimer <[email protected]>
AuthorDate: Tue Apr 30 16:54:00 2024 +0100
Accept parquet schemas without explicitly required Map keys (#5630)
* Accept parquet schemas without explicitly required Map keys
* Map key is always non-nullable
* Add comment
* Add test for incorrect_map_schema.parquet
* Test map data
* Bump submodule
* Clippy and code comment suggestion
---
parquet-testing | 2 +-
parquet/src/arrow/arrow_reader/mod.rs | 40 +++++++++++++++++++++++++++++++++++
parquet/src/arrow/schema/complex.rs | 18 +++++++++++++---
parquet/src/arrow/schema/mod.rs | 24 +++++++++++++++++++++
4 files changed, 80 insertions(+), 4 deletions(-)
diff --git a/parquet-testing b/parquet-testing
index 4cb3cff24c9..1ba34478f53 160000
--- a/parquet-testing
+++ b/parquet-testing
@@ -1 +1 @@
-Subproject commit 4cb3cff24c965fb329cdae763eabce47395a68a0
+Subproject commit 1ba34478f535c89382263c42c675a9af4f57f2dd
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 78d0fd6da8a..a30bf168619 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1436,6 +1436,46 @@ mod tests {
assert_eq!(row_count, 300);
}
+ #[test]
+ fn test_read_incorrect_map_schema_file() {
+ let testdata = arrow::util::test_util::parquet_test_data();
+ // see https://github.com/apache/parquet-testing/pull/47
+ let path = format!("{testdata}/incorrect_map_schema.parquet");
+ let file = File::open(path).unwrap();
+ let mut record_reader = ParquetRecordBatchReader::try_new(file, 32).unwrap();
+
+ let batch = record_reader.next().unwrap().unwrap();
+ assert_eq!(batch.num_rows(), 1);
+
+ let expected_schema = Schema::new(Fields::from(vec![Field::new(
+ "my_map",
+ ArrowDataType::Map(
+ Arc::new(Field::new(
+ "key_value",
+ ArrowDataType::Struct(Fields::from(vec![
+ Field::new("key", ArrowDataType::Utf8, false),
+ Field::new("value", ArrowDataType::Utf8, true),
+ ])),
+ false,
+ )),
+ false,
+ ),
+ true,
+ )]));
+ assert_eq!(batch.schema().as_ref(), &expected_schema);
+
+ assert_eq!(batch.num_rows(), 1);
+ assert_eq!(batch.column(0).null_count(), 0);
+ assert_eq!(
+ batch.column(0).as_map().keys().as_ref(),
+ &StringArray::from(vec!["parent", "name"])
+ );
+ assert_eq!(
+ batch.column(0).as_map().values().as_ref(),
+ &StringArray::from(vec!["another", "report"])
+ );
+ }
+
/// Parameters for single_column_reader_test
#[derive(Clone)]
struct TestOptions {
diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs
index 9f85b2c284c..3107a282e0d 100644
--- a/parquet/src/arrow/schema/complex.rs
+++ b/parquet/src/arrow/schema/complex.rs
@@ -286,8 +286,16 @@ impl Visitor {
let map_key = &map_key_value.get_fields()[0];
let map_value = &map_key_value.get_fields()[1];
- if map_key.get_basic_info().repetition() != Repetition::REQUIRED {
- return Err(arrow_err!("Map keys must be required"));
+ match map_key.get_basic_info().repetition() {
+ Repetition::REPEATED => {
+ return Err(arrow_err!("Map keys cannot be repeated"));
+ }
+ Repetition::REQUIRED | Repetition::OPTIONAL => {
+ // Relaxed check for having repetition REQUIRED as there exists
+ // parquet writers and files that do not conform to this standard.
+ // This allows us to consume a broader range of existing files even
+ // if they are out of spec.
+ }
}
if map_value.get_basic_info().repetition() == Repetition::REPEATED {
@@ -346,7 +354,11 @@ impl Visitor {
// Need both columns to be projected
match (maybe_key, maybe_value) {
(Some(key), Some(value)) => {
- let key_field = Arc::new(convert_field(map_key, &key, arrow_key));
+ let key_field = Arc::new(
+ convert_field(map_key, &key, arrow_key)
+ // The key is always non-nullable (#5630)
+ .with_nullable(false),
+ );
+ let value_field = Arc::new(convert_field(map_value, &value, arrow_value));
let field_metadata = match arrow_map {
Some(field) => field.metadata().clone(),
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index e50016fd5b1..37f368b203a 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -1018,6 +1018,12 @@ mod tests {
OPTIONAL int32 value;
}
}
+ REQUIRED group my_map4 (MAP) {
+ REPEATED group map {
+ OPTIONAL binary key (UTF8);
+ REQUIRED int32 value;
+ }
+ }
}
";
@@ -1075,6 +1081,24 @@ mod tests {
));
}
+ // // Map<String, Integer> (non-compliant nullable key)
+ // group my_map (MAP_KEY_VALUE) {
+ // repeated group map {
+ // optional binary key (UTF8);
+ // required int32 value;
+ // }
+ // }
+ {
+ arrow_fields.push(Field::new_map(
+ "my_map4",
+ "map",
+ Field::new("key", DataType::Utf8, false), // The key is always non-nullable (#5630)
+ Field::new("value", DataType::Int32, false),
+ false,
+ false,
+ ));
+ }
+
let parquet_group_type = parse_message_type(message_type).unwrap();
let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type));