This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 1e67364f81 fix: Ignore names of technical inner fields (of List and
Map types) when comparing datatypes for logical equivalence (#13522)
1e67364f81 is described below
commit 1e67364f81e434e170ee8042422b57071b3a232a
Author: Arttu <[email protected]>
AuthorDate: Mon Nov 25 22:32:47 2024 +0100
fix: Ignore names of technical inner fields (of List and Map types) when
comparing datatypes for logical equivalence (#13522)
* fix: Ignore names of technical inner fields (of List and Map types) when
comparing datatypes for logical equivalence
* apply same logic to datatype_is_semantically_equal
---
datafusion/common/src/dfschema.rs | 326 +++++++++++++++++++++++++++++++++++++-
1 file changed, 320 insertions(+), 6 deletions(-)
diff --git a/datafusion/common/src/dfschema.rs
b/datafusion/common/src/dfschema.rs
index e893cee089..45620c3cac 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -656,9 +656,26 @@ impl DFSchema {
(othertype, DataType::Dictionary(_, v1)) => v1.as_ref() ==
othertype,
(DataType::List(f1), DataType::List(f2))
| (DataType::LargeList(f1), DataType::LargeList(f2))
- | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
- | (DataType::Map(f1, _), DataType::Map(f2, _)) => {
- Self::field_is_logically_equal(f1, f2)
+ | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
=> {
+ // Don't compare the names of the technical inner field
+ // Usually "item" but that's not mandated
+ Self::datatype_is_logically_equal(f1.data_type(),
f2.data_type())
+ }
+ (DataType::Map(f1, _), DataType::Map(f2, _)) => {
+ // Don't compare the names of the technical inner fields
+ // Usually "entries", "key", "value" but that's not mandated
+ match (f1.data_type(), f2.data_type()) {
+ (DataType::Struct(f1_inner), DataType::Struct(f2_inner))
=> {
+ f1_inner.len() == f2_inner.len()
+ && f1_inner.iter().zip(f2_inner.iter()).all(|(f1,
f2)| {
+ Self::datatype_is_logically_equal(
+ f1.data_type(),
+ f2.data_type(),
+ )
+ })
+ }
+ _ => panic!("Map type should have an inner struct field"),
+ }
}
(DataType::Struct(fields1), DataType::Struct(fields2)) => {
let iter1 = fields1.iter();
@@ -695,9 +712,26 @@ impl DFSchema {
}
(DataType::List(f1), DataType::List(f2))
| (DataType::LargeList(f1), DataType::LargeList(f2))
- | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
- | (DataType::Map(f1, _), DataType::Map(f2, _)) => {
- Self::field_is_semantically_equal(f1, f2)
+ | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
=> {
+ // Don't compare the names of the technical inner field
+ // Usually "item" but that's not mandated
+ Self::datatype_is_semantically_equal(f1.data_type(),
f2.data_type())
+ }
+ (DataType::Map(f1, _), DataType::Map(f2, _)) => {
+ // Don't compare the names of the technical inner fields
+ // Usually "entries", "key", "value" but that's not mandated
+ match (f1.data_type(), f2.data_type()) {
+ (DataType::Struct(f1_inner), DataType::Struct(f2_inner))
=> {
+ f1_inner.len() == f2_inner.len()
+ && f1_inner.iter().zip(f2_inner.iter()).all(|(f1,
f2)| {
+ Self::datatype_is_semantically_equal(
+ f1.data_type(),
+ f2.data_type(),
+ )
+ })
+ }
+ _ => panic!("Map type should have an inner struct field"),
+ }
}
(DataType::Struct(fields1), DataType::Struct(fields2)) => {
let iter1 = fields1.iter();
@@ -1332,6 +1366,286 @@ mod tests {
Ok(())
}
+ #[test]
+ fn test_datatype_is_logically_equal() {
+ assert!(DFSchema::datatype_is_logically_equal(
+ &DataType::Int8,
+ &DataType::Int8
+ ));
+
+ assert!(!DFSchema::datatype_is_logically_equal(
+ &DataType::Int8,
+ &DataType::Int16
+ ));
+
+ // Test lists
+
+ // Succeeds if both have the same element type, disregards names and
nullability
+ assert!(DFSchema::datatype_is_logically_equal(
+ &DataType::List(Field::new("item", DataType::Int8, true).into()),
+ &DataType::List(Field::new("element", DataType::Int8,
false).into())
+ ));
+
+ // Fails if element type is different
+ assert!(!DFSchema::datatype_is_logically_equal(
+ &DataType::List(Field::new("item", DataType::Int8, true).into()),
+ &DataType::List(Field::new("item", DataType::Int16, true).into())
+ ));
+
+ // Test maps
+ let map_field = DataType::Map(
+ Field::new(
+ "entries",
+ DataType::Struct(Fields::from(vec![
+ Field::new("key", DataType::Int8, false),
+ Field::new("value", DataType::Int8, true),
+ ])),
+ true,
+ )
+ .into(),
+ true,
+ );
+
+ // Succeeds if both maps have the same key and value types, disregards
names and nullability
+ assert!(DFSchema::datatype_is_logically_equal(
+ &map_field,
+ &DataType::Map(
+ Field::new(
+ "pairs",
+ DataType::Struct(Fields::from(vec![
+ Field::new("one", DataType::Int8, false),
+ Field::new("two", DataType::Int8, false)
+ ])),
+ true
+ )
+ .into(),
+ true
+ )
+ ));
+ // Fails if value type is different
+ assert!(!DFSchema::datatype_is_logically_equal(
+ &map_field,
+ &DataType::Map(
+ Field::new(
+ "entries",
+ DataType::Struct(Fields::from(vec![
+ Field::new("key", DataType::Int8, false),
+ Field::new("value", DataType::Int16, true)
+ ])),
+ true
+ )
+ .into(),
+ true
+ )
+ ));
+
+ // Fails if key type is different
+ assert!(!DFSchema::datatype_is_logically_equal(
+ &map_field,
+ &DataType::Map(
+ Field::new(
+ "entries",
+ DataType::Struct(Fields::from(vec![
+ Field::new("key", DataType::Int16, false),
+ Field::new("value", DataType::Int8, true)
+ ])),
+ true
+ )
+ .into(),
+ true
+ )
+ ));
+
+ // Test structs
+
+ let struct_field = DataType::Struct(Fields::from(vec![
+ Field::new("a", DataType::Int8, true),
+ Field::new("b", DataType::Int8, true),
+ ]));
+
+ // Succeeds if both have same names and datatypes, ignores nullability
+ assert!(DFSchema::datatype_is_logically_equal(
+ &struct_field,
+ &DataType::Struct(Fields::from(vec![
+ Field::new("a", DataType::Int8, false),
+ Field::new("b", DataType::Int8, true),
+ ]))
+ ));
+
+ // Fails if field names are different
+ assert!(!DFSchema::datatype_is_logically_equal(
+ &struct_field,
+ &DataType::Struct(Fields::from(vec![
+ Field::new("x", DataType::Int8, true),
+ Field::new("y", DataType::Int8, true),
+ ]))
+ ));
+
+ // Fails if types are different
+ assert!(!DFSchema::datatype_is_logically_equal(
+ &struct_field,
+ &DataType::Struct(Fields::from(vec![
+ Field::new("a", DataType::Int16, true),
+ Field::new("b", DataType::Int8, true),
+ ]))
+ ));
+
+ // Fails if more or less fields
+ assert!(!DFSchema::datatype_is_logically_equal(
+ &struct_field,
+ &DataType::Struct(Fields::from(vec![Field::new("a",
DataType::Int8, true),]))
+ ));
+ }
+
+ #[test]
+ fn test_datatype_is_logically_equivalent_to_dictionary() {
+ // Dictionary is logically equal to its value type
+ assert!(DFSchema::datatype_is_logically_equal(
+ &DataType::Utf8,
+ &DataType::Dictionary(Box::new(DataType::Int32),
Box::new(DataType::Utf8))
+ ));
+ }
+
+ #[test]
+ fn test_datatype_is_semantically_equal() {
+ assert!(DFSchema::datatype_is_semantically_equal(
+ &DataType::Int8,
+ &DataType::Int8
+ ));
+
+ assert!(!DFSchema::datatype_is_semantically_equal(
+ &DataType::Int8,
+ &DataType::Int16
+ ));
+
+ // Test lists
+
+ // Succeeds if both have the same element type, disregards names and
nullability
+ assert!(DFSchema::datatype_is_semantically_equal(
+ &DataType::List(Field::new("item", DataType::Int8, true).into()),
+ &DataType::List(Field::new("element", DataType::Int8,
false).into())
+ ));
+
+ // Fails if element type is different
+ assert!(!DFSchema::datatype_is_semantically_equal(
+ &DataType::List(Field::new("item", DataType::Int8, true).into()),
+ &DataType::List(Field::new("item", DataType::Int16, true).into())
+ ));
+
+ // Test maps
+ let map_field = DataType::Map(
+ Field::new(
+ "entries",
+ DataType::Struct(Fields::from(vec![
+ Field::new("key", DataType::Int8, false),
+ Field::new("value", DataType::Int8, true),
+ ])),
+ true,
+ )
+ .into(),
+ true,
+ );
+
+ // Succeeds if both maps have the same key and value types, disregards
names and nullability
+ assert!(DFSchema::datatype_is_semantically_equal(
+ &map_field,
+ &DataType::Map(
+ Field::new(
+ "pairs",
+ DataType::Struct(Fields::from(vec![
+ Field::new("one", DataType::Int8, false),
+ Field::new("two", DataType::Int8, false)
+ ])),
+ true
+ )
+ .into(),
+ true
+ )
+ ));
+ // Fails if value type is different
+ assert!(!DFSchema::datatype_is_semantically_equal(
+ &map_field,
+ &DataType::Map(
+ Field::new(
+ "entries",
+ DataType::Struct(Fields::from(vec![
+ Field::new("key", DataType::Int8, false),
+ Field::new("value", DataType::Int16, true)
+ ])),
+ true
+ )
+ .into(),
+ true
+ )
+ ));
+
+ // Fails if key type is different
+ assert!(!DFSchema::datatype_is_semantically_equal(
+ &map_field,
+ &DataType::Map(
+ Field::new(
+ "entries",
+ DataType::Struct(Fields::from(vec![
+ Field::new("key", DataType::Int16, false),
+ Field::new("value", DataType::Int8, true)
+ ])),
+ true
+ )
+ .into(),
+ true
+ )
+ ));
+
+ // Test structs
+
+ let struct_field = DataType::Struct(Fields::from(vec![
+ Field::new("a", DataType::Int8, true),
+ Field::new("b", DataType::Int8, true),
+ ]));
+
+ // Succeeds if both have same names and datatypes, ignores nullability
+ assert!(DFSchema::datatype_is_logically_equal(
+ &struct_field,
+ &DataType::Struct(Fields::from(vec![
+ Field::new("a", DataType::Int8, false),
+ Field::new("b", DataType::Int8, true),
+ ]))
+ ));
+
+ // Fails if field names are different
+ assert!(!DFSchema::datatype_is_logically_equal(
+ &struct_field,
+ &DataType::Struct(Fields::from(vec![
+ Field::new("x", DataType::Int8, true),
+ Field::new("y", DataType::Int8, true),
+ ]))
+ ));
+
+ // Fails if types are different
+ assert!(!DFSchema::datatype_is_logically_equal(
+ &struct_field,
+ &DataType::Struct(Fields::from(vec![
+ Field::new("a", DataType::Int16, true),
+ Field::new("b", DataType::Int8, true),
+ ]))
+ ));
+
+ // Fails if more or less fields
+ assert!(!DFSchema::datatype_is_logically_equal(
+ &struct_field,
+ &DataType::Struct(Fields::from(vec![Field::new("a",
DataType::Int8, true),]))
+ ));
+ }
+
+ #[test]
+ fn test_datatype_is_not_semantically_equivalent_to_dictionary() {
+ // Dictionary is not semantically equal to its value type
+ assert!(!DFSchema::datatype_is_semantically_equal(
+ &DataType::Utf8,
+ &DataType::Dictionary(Box::new(DataType::Int32),
Box::new(DataType::Utf8))
+ ));
+ }
+
fn test_schema_2() -> Schema {
Schema::new(vec![
Field::new("c100", DataType::Boolean, true),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]