This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new c0e78d2c0f Remove use of deprecated dict_id in datafusion-proto
(#14173) (#14227)
c0e78d2c0f is described below
commit c0e78d2c0fa45ab28125d08c4b29abb38d7c2051
Author: Sergey Zhukov <[email protected]>
AuthorDate: Tue Feb 11 15:06:55 2025 +0300
Remove use of deprecated dict_id in datafusion-proto (#14173) (#14227)
* Remove use of deprecated dict_id in datafusion-proto (#14173)
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* Fix issues causing GitHub checks to fail
* remove accidental file
* undo deletion of test in copy.slt
* Fix issues causing GitHub checks to fail
---------
Co-authored-by: Sergey Zhukov <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
.../proto-common/proto/datafusion_common.proto | 3 +-
datafusion/proto-common/src/from_proto/mod.rs | 55 ++++-------------
datafusion/proto-common/src/generated/pbjson.rs | 23 -------
datafusion/proto-common/src/generated/prost.rs | 4 +-
datafusion/proto-common/src/to_proto/mod.rs | 3 -
.../proto/src/generated/datafusion_proto_common.rs | 4 +-
.../proto/tests/cases/roundtrip_logical_plan.rs | 72 ----------------------
datafusion/sqllogictest/test_files/copy.slt | 2 +-
datafusion/sqllogictest/test_files/regexp.slt | 4 +-
9 files changed, 18 insertions(+), 152 deletions(-)
diff --git a/datafusion/proto-common/proto/datafusion_common.proto
b/datafusion/proto-common/proto/datafusion_common.proto
index 1c2807f390..8e5d1283f8 100644
--- a/datafusion/proto-common/proto/datafusion_common.proto
+++ b/datafusion/proto-common/proto/datafusion_common.proto
@@ -108,8 +108,7 @@ message Field {
// for complex data types like structs, unions
repeated Field children = 4;
map<string, string> metadata = 5;
- int64 dict_id = 6;
- bool dict_ordered = 7;
+ bool dict_ordered = 6;
}
message Timestamp{
diff --git a/datafusion/proto-common/src/from_proto/mod.rs
b/datafusion/proto-common/src/from_proto/mod.rs
index b022e52b6a..93547efeb5 100644
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ -320,21 +320,8 @@ impl TryFrom<&protobuf::Field> for Field {
type Error = Error;
fn try_from(field: &protobuf::Field) -> Result<Self, Self::Error> {
let datatype = field.arrow_type.as_deref().required("arrow_type")?;
- let field = if field.dict_id != 0 {
- // https://github.com/apache/datafusion/issues/14173
- #[allow(deprecated)]
- Self::new_dict(
- field.name.as_str(),
- datatype,
- field.nullable,
- field.dict_id,
- field.dict_ordered,
- )
- .with_metadata(field.metadata.clone())
- } else {
- Self::new(field.name.as_str(), datatype, field.nullable)
- .with_metadata(field.metadata.clone())
- };
+ let field = Self::new(field.name.as_str(), datatype, field.nullable)
+ .with_metadata(field.metadata.clone());
Ok(field)
}
}
@@ -436,36 +423,18 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
let id = dict_batch.id();
- let fields_using_this_dictionary = {
- // See
https://github.com/apache/datafusion/issues/14173
- #[allow(deprecated)]
- schema.fields_with_dict_id(id)
- };
+ let record_batch = read_record_batch(
+ &buffer,
+ dict_batch.data().unwrap(),
+ Arc::new(schema.clone()),
+ &Default::default(),
+ None,
+ &message.version(),
+ )?;
- let first_field =
fields_using_this_dictionary.first().ok_or_else(|| {
- Error::General("dictionary id not found in schema
while deserializing ScalarValue::List".to_string())
- })?;
+ let values: ArrayRef = Arc::clone(record_batch.column(0));
- let values: ArrayRef = match first_field.data_type() {
- DataType::Dictionary(_, ref value_type) => {
- // Make a fake schema for the dictionary batch.
- let value = value_type.as_ref().clone();
- let schema = Schema::new(vec![Field::new("",
value, true)]);
- // Read a single column
- let record_batch = read_record_batch(
- &buffer,
- dict_batch.data().unwrap(),
- Arc::new(schema),
- &Default::default(),
- None,
- &message.version(),
- )?;
- Ok(Arc::clone(record_batch.column(0)))
- }
- _ => Err(Error::General("dictionary id not found in
schema while deserializing ScalarValue::List".to_string())),
- }?;
-
- Ok((id,values))
+ Ok((id, values))
}).collect::<datafusion_common::Result<HashMap<_, _>>>()?;
let record_batch = read_record_batch(
diff --git a/datafusion/proto-common/src/generated/pbjson.rs
b/datafusion/proto-common/src/generated/pbjson.rs
index 40687de098..8c0a9041ba 100644
--- a/datafusion/proto-common/src/generated/pbjson.rs
+++ b/datafusion/proto-common/src/generated/pbjson.rs
@@ -3107,9 +3107,6 @@ impl serde::Serialize for Field {
if !self.metadata.is_empty() {
len += 1;
}
- if self.dict_id != 0 {
- len += 1;
- }
if self.dict_ordered {
len += 1;
}
@@ -3129,11 +3126,6 @@ impl serde::Serialize for Field {
if !self.metadata.is_empty() {
struct_ser.serialize_field("metadata", &self.metadata)?;
}
- if self.dict_id != 0 {
- #[allow(clippy::needless_borrow)]
- #[allow(clippy::needless_borrows_for_generic_args)]
- struct_ser.serialize_field("dictId",
ToString::to_string(&self.dict_id).as_str())?;
- }
if self.dict_ordered {
struct_ser.serialize_field("dictOrdered", &self.dict_ordered)?;
}
@@ -3141,7 +3133,6 @@ impl serde::Serialize for Field {
}
}
impl<'de> serde::Deserialize<'de> for Field {
- #[allow(deprecated)]
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
@@ -3153,8 +3144,6 @@ impl<'de> serde::Deserialize<'de> for Field {
"nullable",
"children",
"metadata",
- "dict_id",
- "dictId",
"dict_ordered",
"dictOrdered",
];
@@ -3166,7 +3155,6 @@ impl<'de> serde::Deserialize<'de> for Field {
Nullable,
Children,
Metadata,
- DictId,
DictOrdered,
}
impl<'de> serde::Deserialize<'de> for GeneratedField {
@@ -3194,7 +3182,6 @@ impl<'de> serde::Deserialize<'de> for Field {
"nullable" => Ok(GeneratedField::Nullable),
"children" => Ok(GeneratedField::Children),
"metadata" => Ok(GeneratedField::Metadata),
- "dictId" | "dict_id" => Ok(GeneratedField::DictId),
"dictOrdered" | "dict_ordered" =>
Ok(GeneratedField::DictOrdered),
_ => Err(serde::de::Error::unknown_field(value,
FIELDS)),
}
@@ -3220,7 +3207,6 @@ impl<'de> serde::Deserialize<'de> for Field {
let mut nullable__ = None;
let mut children__ = None;
let mut metadata__ = None;
- let mut dict_id__ = None;
let mut dict_ordered__ = None;
while let Some(k) = map_.next_key()? {
match k {
@@ -3256,14 +3242,6 @@ impl<'de> serde::Deserialize<'de> for Field {
map_.next_value::<std::collections::HashMap<_,
_>>()?
);
}
- GeneratedField::DictId => {
- if dict_id__.is_some() {
- return
Err(serde::de::Error::duplicate_field("dictId"));
- }
- dict_id__ =
-
Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0)
- ;
- }
GeneratedField::DictOrdered => {
if dict_ordered__.is_some() {
return
Err(serde::de::Error::duplicate_field("dictOrdered"));
@@ -3278,7 +3256,6 @@ impl<'de> serde::Deserialize<'de> for Field {
nullable: nullable__.unwrap_or_default(),
children: children__.unwrap_or_default(),
metadata: metadata__.unwrap_or_default(),
- dict_id: dict_id__.unwrap_or_default(),
dict_ordered: dict_ordered__.unwrap_or_default(),
})
}
diff --git a/datafusion/proto-common/src/generated/prost.rs
b/datafusion/proto-common/src/generated/prost.rs
index 9e4a1ecb6b..db46b47efc 100644
--- a/datafusion/proto-common/src/generated/prost.rs
+++ b/datafusion/proto-common/src/generated/prost.rs
@@ -106,9 +106,7 @@ pub struct Field {
::prost::alloc::string::String,
::prost::alloc::string::String,
>,
- #[prost(int64, tag = "6")]
- pub dict_id: i64,
- #[prost(bool, tag = "7")]
+ #[prost(bool, tag = "6")]
pub dict_ordered: bool,
}
#[derive(Clone, PartialEq, ::prost::Message)]
diff --git a/datafusion/proto-common/src/to_proto/mod.rs
b/datafusion/proto-common/src/to_proto/mod.rs
index ced1865795..83c8e98cba 100644
--- a/datafusion/proto-common/src/to_proto/mod.rs
+++ b/datafusion/proto-common/src/to_proto/mod.rs
@@ -97,9 +97,6 @@ impl TryFrom<&Field> for protobuf::Field {
nullable: field.is_nullable(),
children: Vec::new(),
metadata: field.metadata().clone(),
- #[allow(deprecated)]
- // See https://github.com/apache/datafusion/issues/14173 to remove
deprecated dict_id
- dict_id: field.dict_id().unwrap_or(0),
dict_ordered: field.dict_is_ordered().unwrap_or(false),
})
}
diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs
b/datafusion/proto/src/generated/datafusion_proto_common.rs
index 9e4a1ecb6b..db46b47efc 100644
--- a/datafusion/proto/src/generated/datafusion_proto_common.rs
+++ b/datafusion/proto/src/generated/datafusion_proto_common.rs
@@ -106,9 +106,7 @@ pub struct Field {
::prost::alloc::string::String,
::prost::alloc::string::String,
>,
- #[prost(int64, tag = "6")]
- pub dict_id: i64,
- #[prost(bool, tag = "7")]
+ #[prost(bool, tag = "6")]
pub dict_ordered: bool,
}
#[derive(Clone, PartialEq, ::prost::Message)]
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index 9a60c4f306..9cc7514a0d 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -1494,20 +1494,6 @@ fn round_trip_scalar_values_and_data_types() {
Field::new("b", DataType::Boolean, false),
ScalarValue::from(false),
)
- .with_scalar(
- Field::new(
- "c",
- DataType::Dictionary(
- Box::new(DataType::UInt16),
- Box::new(DataType::Utf8),
- ),
- false,
- ),
- ScalarValue::Dictionary(
- Box::new(DataType::UInt16),
- Box::new("value".into()),
- ),
- )
.build()
.unwrap(),
ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
@@ -1518,25 +1504,6 @@ fn round_trip_scalar_values_and_data_types() {
ScalarValue::try_from(&DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Int32, true),
Field::new("b", DataType::Boolean, false),
- Field::new(
- "c",
- DataType::Dictionary(
- Box::new(DataType::UInt16),
- Box::new(DataType::Binary),
- ),
- false,
- ),
- Field::new(
- "d",
- DataType::new_list(
- DataType::Dictionary(
- Box::new(DataType::UInt16),
- Box::new(DataType::Binary),
- ),
- false,
- ),
- false,
- ),
])))
.unwrap(),
ScalarValue::try_from(&DataType::Map(
@@ -1815,45 +1782,6 @@ fn round_trip_datatype() {
}
}
-// See https://github.com/apache/datafusion/issues/14173 to remove deprecated
dict_id
-#[allow(deprecated)]
-#[test]
-fn roundtrip_dict_id() -> Result<()> {
- let dict_id = 42;
- let field = Field::new(
- "keys",
- DataType::List(Arc::new(Field::new_dict(
- "item",
- DataType::Dictionary(Box::new(DataType::UInt16),
Box::new(DataType::Utf8)),
- true,
- dict_id,
- false,
- ))),
- false,
- );
- let schema = Arc::new(Schema::new(vec![field]));
-
- // encode
- let mut buf: Vec<u8> = vec![];
- let schema_proto: protobuf::Schema = schema.try_into().unwrap();
- schema_proto.encode(&mut buf).unwrap();
-
- // decode
- let schema_proto = protobuf::Schema::decode(buf.as_slice()).unwrap();
- let decoded: Schema = (&schema_proto).try_into()?;
-
- // assert
- let keys = decoded.fields().iter().last().unwrap();
- match keys.data_type() {
- DataType::List(field) => {
- assert_eq!(field.dict_id(), Some(dict_id), "dict_id should be
retained");
- }
- _ => panic!("Invalid type"),
- }
-
- Ok(())
-}
-
#[test]
fn roundtrip_null_scalar_values() {
let test_types = vec![
diff --git a/datafusion/sqllogictest/test_files/copy.slt
b/datafusion/sqllogictest/test_files/copy.slt
index cd0a38a5e0..7dd85b3ae2 100644
--- a/datafusion/sqllogictest/test_files/copy.slt
+++ b/datafusion/sqllogictest/test_files/copy.slt
@@ -558,7 +558,6 @@ select * from validate_arrow_file_dict;
c foo
d bar
-
# Copy from table to folder of json
query I
COPY source_table to 'test_files/scratch/copy/table_arrow' STORED AS ARROW;
@@ -632,3 +631,4 @@ COPY source_table to '/tmp/table.parquet' (row_group_size
55 + 102);
# Copy using execution.keep_partition_by_columns with an invalid value
query error DataFusion error: Invalid or Unsupported Configuration: provided
value for 'execution.keep_partition_by_columns' was not recognized:
"invalid_value"
COPY source_table to '/tmp/table.parquet' OPTIONS
(execution.keep_partition_by_columns invalid_value);
+
diff --git a/datafusion/sqllogictest/test_files/regexp.slt
b/datafusion/sqllogictest/test_files/regexp.slt
index ce39434e68..44ba61e877 100644
--- a/datafusion/sqllogictest/test_files/regexp.slt
+++ b/datafusion/sqllogictest/test_files/regexp.slt
@@ -477,8 +477,8 @@ SELECT 'foo\nbar\nbaz' ~ 'bar';
true
statement error
-Error during planning: Cannot infer common argument type for regex operation
List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0,
dict_is_ordered: false, metadata
-: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true,
dict_id: 0, dict_is_ordered: false, metadata: {} })
+Error during planning: Cannot infer common argument type for regex operation
List(Field { name: "item", data_type: Int64, nullable: true, dict_is_ordered:
false, metadata
+: {} }) ~ List(Field { name: "item", data_type: Int64, nullable: true,
dict_is_ordered: false, metadata: {} })
select [1,2] ~ [3];
query B
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]