This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new c0e78d2c0f Remove use of deprecated dict_id in datafusion-proto 
(#14173) (#14227)
c0e78d2c0f is described below

commit c0e78d2c0fa45ab28125d08c4b29abb38d7c2051
Author: Sergey Zhukov <[email protected]>
AuthorDate: Tue Feb 11 15:06:55 2025 +0300

    Remove use of deprecated dict_id in datafusion-proto (#14173) (#14227)
    
    * Remove use of deprecated dict_id in datafusion-proto (#14173)
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * Fix issues causing GitHub checks to fail
    
    * remove accidental file
    
    * undo deletion of test in copy.slt
    
    * Fix issues causing GitHub checks to fail
    
    ---------
    
    Co-authored-by: Sergey Zhukov <[email protected]>
    Co-authored-by: Andrew Lamb <[email protected]>
---
 .../proto-common/proto/datafusion_common.proto     |  3 +-
 datafusion/proto-common/src/from_proto/mod.rs      | 55 ++++-------------
 datafusion/proto-common/src/generated/pbjson.rs    | 23 -------
 datafusion/proto-common/src/generated/prost.rs     |  4 +-
 datafusion/proto-common/src/to_proto/mod.rs        |  3 -
 .../proto/src/generated/datafusion_proto_common.rs |  4 +-
 .../proto/tests/cases/roundtrip_logical_plan.rs    | 72 ----------------------
 datafusion/sqllogictest/test_files/copy.slt        |  2 +-
 datafusion/sqllogictest/test_files/regexp.slt      |  4 +-
 9 files changed, 18 insertions(+), 152 deletions(-)

diff --git a/datafusion/proto-common/proto/datafusion_common.proto 
b/datafusion/proto-common/proto/datafusion_common.proto
index 1c2807f390..8e5d1283f8 100644
--- a/datafusion/proto-common/proto/datafusion_common.proto
+++ b/datafusion/proto-common/proto/datafusion_common.proto
@@ -108,8 +108,7 @@ message Field {
   // for complex data types like structs, unions
   repeated Field children = 4;
   map<string, string> metadata = 5;
-  int64 dict_id = 6;
-  bool dict_ordered = 7;
+  bool dict_ordered = 6;
 }
 
 message Timestamp{
diff --git a/datafusion/proto-common/src/from_proto/mod.rs 
b/datafusion/proto-common/src/from_proto/mod.rs
index b022e52b6a..93547efeb5 100644
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ -320,21 +320,8 @@ impl TryFrom<&protobuf::Field> for Field {
     type Error = Error;
     fn try_from(field: &protobuf::Field) -> Result<Self, Self::Error> {
         let datatype = field.arrow_type.as_deref().required("arrow_type")?;
-        let field = if field.dict_id != 0 {
-            // https://github.com/apache/datafusion/issues/14173
-            #[allow(deprecated)]
-            Self::new_dict(
-                field.name.as_str(),
-                datatype,
-                field.nullable,
-                field.dict_id,
-                field.dict_ordered,
-            )
-            .with_metadata(field.metadata.clone())
-        } else {
-            Self::new(field.name.as_str(), datatype, field.nullable)
-                .with_metadata(field.metadata.clone())
-        };
+        let field = Self::new(field.name.as_str(), datatype, field.nullable)
+            .with_metadata(field.metadata.clone());
         Ok(field)
     }
 }
@@ -436,36 +423,18 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
 
                     let id = dict_batch.id();
 
-                    let fields_using_this_dictionary = {
-                        // See 
https://github.com/apache/datafusion/issues/14173
-                        #[allow(deprecated)]
-                        schema.fields_with_dict_id(id)
-                    };
+                    let record_batch = read_record_batch(
+                        &buffer,
+                        dict_batch.data().unwrap(),
+                        Arc::new(schema.clone()),
+                        &Default::default(),
+                        None,
+                        &message.version(),
+                    )?;
 
-                    let first_field = 
fields_using_this_dictionary.first().ok_or_else(|| {
-                        Error::General("dictionary id not found in schema 
while deserializing ScalarValue::List".to_string())
-                    })?;
+                    let values: ArrayRef = Arc::clone(record_batch.column(0));
 
-                    let values: ArrayRef = match first_field.data_type() {
-                        DataType::Dictionary(_, ref value_type) => {
-                            // Make a fake schema for the dictionary batch.
-                            let value = value_type.as_ref().clone();
-                            let schema = Schema::new(vec![Field::new("", 
value, true)]);
-                            // Read a single column
-                            let record_batch = read_record_batch(
-                                &buffer,
-                                dict_batch.data().unwrap(),
-                                Arc::new(schema),
-                                &Default::default(),
-                                None,
-                                &message.version(),
-                            )?;
-                            Ok(Arc::clone(record_batch.column(0)))
-                        }
-                        _ => Err(Error::General("dictionary id not found in 
schema while deserializing ScalarValue::List".to_string())),
-                    }?;
-
-                    Ok((id,values))
+                    Ok((id, values))
                 }).collect::<datafusion_common::Result<HashMap<_, _>>>()?;
 
                 let record_batch = read_record_batch(
diff --git a/datafusion/proto-common/src/generated/pbjson.rs 
b/datafusion/proto-common/src/generated/pbjson.rs
index 40687de098..8c0a9041ba 100644
--- a/datafusion/proto-common/src/generated/pbjson.rs
+++ b/datafusion/proto-common/src/generated/pbjson.rs
@@ -3107,9 +3107,6 @@ impl serde::Serialize for Field {
         if !self.metadata.is_empty() {
             len += 1;
         }
-        if self.dict_id != 0 {
-            len += 1;
-        }
         if self.dict_ordered {
             len += 1;
         }
@@ -3129,11 +3126,6 @@ impl serde::Serialize for Field {
         if !self.metadata.is_empty() {
             struct_ser.serialize_field("metadata", &self.metadata)?;
         }
-        if self.dict_id != 0 {
-            #[allow(clippy::needless_borrow)]
-            #[allow(clippy::needless_borrows_for_generic_args)]
-            struct_ser.serialize_field("dictId", 
ToString::to_string(&self.dict_id).as_str())?;
-        }
         if self.dict_ordered {
             struct_ser.serialize_field("dictOrdered", &self.dict_ordered)?;
         }
@@ -3141,7 +3133,6 @@ impl serde::Serialize for Field {
     }
 }
 impl<'de> serde::Deserialize<'de> for Field {
-    #[allow(deprecated)]
     fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
     where
         D: serde::Deserializer<'de>,
@@ -3153,8 +3144,6 @@ impl<'de> serde::Deserialize<'de> for Field {
             "nullable",
             "children",
             "metadata",
-            "dict_id",
-            "dictId",
             "dict_ordered",
             "dictOrdered",
         ];
@@ -3166,7 +3155,6 @@ impl<'de> serde::Deserialize<'de> for Field {
             Nullable,
             Children,
             Metadata,
-            DictId,
             DictOrdered,
         }
         impl<'de> serde::Deserialize<'de> for GeneratedField {
@@ -3194,7 +3182,6 @@ impl<'de> serde::Deserialize<'de> for Field {
                             "nullable" => Ok(GeneratedField::Nullable),
                             "children" => Ok(GeneratedField::Children),
                             "metadata" => Ok(GeneratedField::Metadata),
-                            "dictId" | "dict_id" => Ok(GeneratedField::DictId),
                             "dictOrdered" | "dict_ordered" => 
Ok(GeneratedField::DictOrdered),
                             _ => Err(serde::de::Error::unknown_field(value, 
FIELDS)),
                         }
@@ -3220,7 +3207,6 @@ impl<'de> serde::Deserialize<'de> for Field {
                 let mut nullable__ = None;
                 let mut children__ = None;
                 let mut metadata__ = None;
-                let mut dict_id__ = None;
                 let mut dict_ordered__ = None;
                 while let Some(k) = map_.next_key()? {
                     match k {
@@ -3256,14 +3242,6 @@ impl<'de> serde::Deserialize<'de> for Field {
                                 map_.next_value::<std::collections::HashMap<_, 
_>>()?
                             );
                         }
-                        GeneratedField::DictId => {
-                            if dict_id__.is_some() {
-                                return 
Err(serde::de::Error::duplicate_field("dictId"));
-                            }
-                            dict_id__ = 
-                                
Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
-                            ;
-                        }
                         GeneratedField::DictOrdered => {
                             if dict_ordered__.is_some() {
                                 return 
Err(serde::de::Error::duplicate_field("dictOrdered"));
@@ -3278,7 +3256,6 @@ impl<'de> serde::Deserialize<'de> for Field {
                     nullable: nullable__.unwrap_or_default(),
                     children: children__.unwrap_or_default(),
                     metadata: metadata__.unwrap_or_default(),
-                    dict_id: dict_id__.unwrap_or_default(),
                     dict_ordered: dict_ordered__.unwrap_or_default(),
                 })
             }
diff --git a/datafusion/proto-common/src/generated/prost.rs 
b/datafusion/proto-common/src/generated/prost.rs
index 9e4a1ecb6b..db46b47efc 100644
--- a/datafusion/proto-common/src/generated/prost.rs
+++ b/datafusion/proto-common/src/generated/prost.rs
@@ -106,9 +106,7 @@ pub struct Field {
         ::prost::alloc::string::String,
         ::prost::alloc::string::String,
     >,
-    #[prost(int64, tag = "6")]
-    pub dict_id: i64,
-    #[prost(bool, tag = "7")]
+    #[prost(bool, tag = "6")]
     pub dict_ordered: bool,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
diff --git a/datafusion/proto-common/src/to_proto/mod.rs 
b/datafusion/proto-common/src/to_proto/mod.rs
index ced1865795..83c8e98cba 100644
--- a/datafusion/proto-common/src/to_proto/mod.rs
+++ b/datafusion/proto-common/src/to_proto/mod.rs
@@ -97,9 +97,6 @@ impl TryFrom<&Field> for protobuf::Field {
             nullable: field.is_nullable(),
             children: Vec::new(),
             metadata: field.metadata().clone(),
-            #[allow(deprecated)]
-            // See https://github.com/apache/datafusion/issues/14173 to remove 
deprecated dict_id
-            dict_id: field.dict_id().unwrap_or(0),
             dict_ordered: field.dict_is_ordered().unwrap_or(false),
         })
     }
diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs 
b/datafusion/proto/src/generated/datafusion_proto_common.rs
index 9e4a1ecb6b..db46b47efc 100644
--- a/datafusion/proto/src/generated/datafusion_proto_common.rs
+++ b/datafusion/proto/src/generated/datafusion_proto_common.rs
@@ -106,9 +106,7 @@ pub struct Field {
         ::prost::alloc::string::String,
         ::prost::alloc::string::String,
     >,
-    #[prost(int64, tag = "6")]
-    pub dict_id: i64,
-    #[prost(bool, tag = "7")]
+    #[prost(bool, tag = "6")]
     pub dict_ordered: bool,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs 
b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index 9a60c4f306..9cc7514a0d 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -1494,20 +1494,6 @@ fn round_trip_scalar_values_and_data_types() {
                 Field::new("b", DataType::Boolean, false),
                 ScalarValue::from(false),
             )
-            .with_scalar(
-                Field::new(
-                    "c",
-                    DataType::Dictionary(
-                        Box::new(DataType::UInt16),
-                        Box::new(DataType::Utf8),
-                    ),
-                    false,
-                ),
-                ScalarValue::Dictionary(
-                    Box::new(DataType::UInt16),
-                    Box::new("value".into()),
-                ),
-            )
             .build()
             .unwrap(),
         ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
@@ -1518,25 +1504,6 @@ fn round_trip_scalar_values_and_data_types() {
         ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
             Field::new("a", DataType::Int32, true),
             Field::new("b", DataType::Boolean, false),
-            Field::new(
-                "c",
-                DataType::Dictionary(
-                    Box::new(DataType::UInt16),
-                    Box::new(DataType::Binary),
-                ),
-                false,
-            ),
-            Field::new(
-                "d",
-                DataType::new_list(
-                    DataType::Dictionary(
-                        Box::new(DataType::UInt16),
-                        Box::new(DataType::Binary),
-                    ),
-                    false,
-                ),
-                false,
-            ),
         ])))
         .unwrap(),
         ScalarValue::try_from(&DataType::Map(
@@ -1815,45 +1782,6 @@ fn round_trip_datatype() {
     }
 }
 
-// See https://github.com/apache/datafusion/issues/14173 to remove deprecated 
dict_id
-#[allow(deprecated)]
-#[test]
-fn roundtrip_dict_id() -> Result<()> {
-    let dict_id = 42;
-    let field = Field::new(
-        "keys",
-        DataType::List(Arc::new(Field::new_dict(
-            "item",
-            DataType::Dictionary(Box::new(DataType::UInt16), 
Box::new(DataType::Utf8)),
-            true,
-            dict_id,
-            false,
-        ))),
-        false,
-    );
-    let schema = Arc::new(Schema::new(vec![field]));
-
-    // encode
-    let mut buf: Vec<u8> = vec![];
-    let schema_proto: protobuf::Schema = schema.try_into().unwrap();
-    schema_proto.encode(&mut buf).unwrap();
-
-    // decode
-    let schema_proto = protobuf::Schema::decode(buf.as_slice()).unwrap();
-    let decoded: Schema = (&schema_proto).try_into()?;
-
-    // assert
-    let keys = decoded.fields().iter().last().unwrap();
-    match keys.data_type() {
-        DataType::List(field) => {
-            assert_eq!(field.dict_id(), Some(dict_id), "dict_id should be 
retained");
-        }
-        _ => panic!("Invalid type"),
-    }
-
-    Ok(())
-}
-
 #[test]
 fn roundtrip_null_scalar_values() {
     let test_types = vec![
diff --git a/datafusion/sqllogictest/test_files/copy.slt 
b/datafusion/sqllogictest/test_files/copy.slt
index cd0a38a5e0..7dd85b3ae2 100644
--- a/datafusion/sqllogictest/test_files/copy.slt
+++ b/datafusion/sqllogictest/test_files/copy.slt
@@ -558,7 +558,6 @@ select * from validate_arrow_file_dict;
 c foo
 d bar
 
-
 # Copy from table to folder of json
 query I
 COPY source_table to 'test_files/scratch/copy/table_arrow' STORED AS ARROW;
@@ -632,3 +631,4 @@ COPY source_table  to '/tmp/table.parquet' (row_group_size 
55 + 102);
 # Copy using execution.keep_partition_by_columns with an invalid value
 query error DataFusion error: Invalid or Unsupported Configuration: provided 
value for 'execution.keep_partition_by_columns' was not recognized: 
"invalid_value"
 COPY source_table  to '/tmp/table.parquet' OPTIONS 
(execution.keep_partition_by_columns invalid_value);
+
diff --git a/datafusion/sqllogictest/test_files/regexp.slt 
b/datafusion/sqllogictest/test_files/regexp.slt
index ce39434e68..44ba61e877 100644
--- a/datafusion/sqllogictest/test_files/regexp.slt
+++ b/datafusion/sqllogictest/test_files/regexp.slt
@@ -477,8 +477,8 @@ SELECT 'foo\nbar\nbaz' ~ 'bar';
 true
 
 statement error
-Error during planning: Cannot infer common argument type for regex operation 
List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata
-: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: {} })
+Error during planning: Cannot infer common argument type for regex operation 
List(Field { name: "item", data_type: Int64, nullable: true, dict_is_ordered: 
false, metadata
+: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true, 
dict_is_ordered: false, metadata: {} })
 select [1,2] ~ [3];
 
 query B


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to