westonpace commented on code in PR #16345:
URL: https://github.com/apache/datafusion/pull/16345#discussion_r2138834078
##########
datafusion/substrait/src/logical_plan/consumer/utils.rs:
##########
@@ -81,98 +81,167 @@ pub(super) fn next_struct_field_name(
}
}
-pub(super) fn rename_field(
+/// Traverse through the field, renaming the provided field itself and all its
inner struct fields.
+pub fn rename_field(
field: &Field,
dfs_names: &Vec<String>,
unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll
use this "c{unnamed_field_suffix}"
name_idx: &mut usize, // Index into dfs_names
- rename_self: bool, // Some fields (e.g. list items) don't have names in
Substrait and this will be false to keep old name
) -> datafusion::common::Result<Field> {
- let name = if rename_self {
- next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)?
- } else {
- field.name().to_string()
- };
- match field.data_type() {
+ let name = next_struct_field_name(unnamed_field_suffix, dfs_names,
name_idx)?;
+ rename_fields_data_type(field.clone().with_name(name), dfs_names, name_idx)
+}
+
+/// Rename the field's data type but not the field itself.
+pub fn rename_fields_data_type(
+ field: Field,
+ dfs_names: &Vec<String>,
+ name_idx: &mut usize, // Index into dfs_names
+) -> datafusion::common::Result<Field> {
+ let dt = rename_data_type(field.data_type(), dfs_names, name_idx)?;
+ Ok(field.with_data_type(dt))
+}
+
+/// Traverse through the data type (incl. lists/maps/etc), renaming all inner
struct fields.
+pub fn rename_data_type(
+ data_type: &DataType,
+ dfs_names: &Vec<String>,
+ name_idx: &mut usize, // Index into dfs_names
+) -> datafusion::common::Result<DataType> {
+ match data_type {
DataType::Struct(children) => {
let children = children
.iter()
.enumerate()
- .map(|(child_idx, f)| {
- rename_field(
- f.as_ref(),
- dfs_names,
- child_idx,
- name_idx,
- /*rename_self=*/ true,
- )
+ .map(|(field_idx, f)| {
+ rename_field(f.as_ref(), dfs_names, field_idx, name_idx)
})
.collect::<datafusion::common::Result<_>>()?;
- Ok(field
- .to_owned()
- .with_name(name)
- .with_data_type(DataType::Struct(children)))
+ Ok(DataType::Struct(children))
}
- DataType::List(inner) => {
- let renamed_inner = rename_field(
- inner.as_ref(),
+ DataType::List(inner) =>
Ok(DataType::List(Arc::new(rename_fields_data_type(
+ inner.as_ref().to_owned(),
+ dfs_names,
+ name_idx,
+ )?))),
+ DataType::LargeList(inner) => Ok(DataType::LargeList(Arc::new(
+ rename_fields_data_type(inner.as_ref().to_owned(), dfs_names,
name_idx)?,
+ ))),
+ DataType::ListView(inner) => Ok(DataType::ListView(Arc::new(
+ rename_fields_data_type(inner.as_ref().to_owned(), dfs_names,
name_idx)?,
+ ))),
+ DataType::LargeListView(inner) => Ok(DataType::LargeListView(Arc::new(
+ rename_fields_data_type(inner.as_ref().to_owned(), dfs_names,
name_idx)?,
+ ))),
+ DataType::FixedSizeList(inner, len) => Ok(DataType::FixedSizeList(
+ Arc::new(rename_fields_data_type(
+ inner.as_ref().to_owned(),
dfs_names,
- 0,
name_idx,
- /*rename_self=*/ false,
- )?;
- Ok(field
- .to_owned()
- .with_data_type(DataType::List(FieldRef::new(renamed_inner)))
- .with_name(name))
+ )?),
+ *len,
+ )),
+ DataType::Map(entries, sorted) => {
+ let entries_data_type = match entries.data_type() {
+ DataType::Struct(fields) => {
+ // This should be two fields, normally "key" and "value",
but not guaranteed
+ let fields = fields
+ .iter()
+ .map(|f| {
+ rename_fields_data_type(
+ f.as_ref().to_owned(),
+ dfs_names,
+ name_idx,
+ )
+ })
+ .collect::<datafusion::common::Result<_>>()?;
+ Ok(DataType::Struct(fields))
+ }
+ _ => exec_err!("Expected map type to contain an inner struct
type"),
+ }?;
+ Ok(DataType::Map(
+ Arc::new(
+ entries
+ .as_ref()
+ .to_owned()
+ .with_data_type(entries_data_type),
+ ),
+ *sorted,
+ ))
}
- DataType::LargeList(inner) => {
- let renamed_inner = rename_field(
- inner.as_ref(),
+ DataType::Dictionary(key_type, value_type) => {
+ // Dicts probably shouldn't contain structs, but support them just
in case one does
+ Ok(DataType::Dictionary(
+ Box::new(rename_data_type(key_type, dfs_names, name_idx)?),
+ Box::new(rename_data_type(value_type, dfs_names, name_idx)?),
+ ))
+ }
+ DataType::RunEndEncoded(run_ends_field, values_field) => {
Review Comment:
Dictionary, REE (run-end encoded), ListView, LargeList, and LargeListView are
all "encodings" and not "types" as far as Substrait is concerned. There is some
discussion of this in
https://github.com/apache/arrow/blob/main/format/substrait/extension_types.yaml
There's nothing wrong with what you have here, and I think it's probably
fine to keep it, but if I'm being pedantic I don't think a producer should ever
produce a plan with these types.
Encodings can be expressed as variations, however, in which case they will
use the base type plus a variation identifier. So it may still be possible
to hit this path if you are parsing one of those.
##########
datafusion/substrait/src/logical_plan/consumer/utils.rs:
##########
@@ -81,98 +81,167 @@ pub(super) fn next_struct_field_name(
}
}
-pub(super) fn rename_field(
+/// Traverse through the field, renaming the provided field itself and all its
inner struct fields.
+pub fn rename_field(
field: &Field,
dfs_names: &Vec<String>,
unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll
use this "c{unnamed_field_suffix}"
name_idx: &mut usize, // Index into dfs_names
- rename_self: bool, // Some fields (e.g. list items) don't have names in
Substrait and this will be false to keep old name
) -> datafusion::common::Result<Field> {
- let name = if rename_self {
- next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)?
- } else {
- field.name().to_string()
- };
- match field.data_type() {
+ let name = next_struct_field_name(unnamed_field_suffix, dfs_names,
name_idx)?;
+ rename_fields_data_type(field.clone().with_name(name), dfs_names, name_idx)
+}
+
+/// Rename the field's data type but not the field itself.
+pub fn rename_fields_data_type(
+ field: Field,
+ dfs_names: &Vec<String>,
+ name_idx: &mut usize, // Index into dfs_names
+) -> datafusion::common::Result<Field> {
+ let dt = rename_data_type(field.data_type(), dfs_names, name_idx)?;
+ Ok(field.with_data_type(dt))
+}
+
+/// Traverse through the data type (incl. lists/maps/etc), renaming all inner
struct fields.
+pub fn rename_data_type(
Review Comment:
Do these two functions need to be `pub` (not a problem if yes, just curious)?
##########
datafusion/substrait/src/logical_plan/consumer/utils.rs:
##########
@@ -81,98 +81,167 @@ pub(super) fn next_struct_field_name(
}
}
-pub(super) fn rename_field(
+/// Traverse through the field, renaming the provided field itself and all its
inner struct fields.
+pub fn rename_field(
field: &Field,
dfs_names: &Vec<String>,
unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll
use this "c{unnamed_field_suffix}"
name_idx: &mut usize, // Index into dfs_names
- rename_self: bool, // Some fields (e.g. list items) don't have names in
Substrait and this will be false to keep old name
) -> datafusion::common::Result<Field> {
- let name = if rename_self {
- next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)?
- } else {
- field.name().to_string()
- };
- match field.data_type() {
+ let name = next_struct_field_name(unnamed_field_suffix, dfs_names,
name_idx)?;
+ rename_fields_data_type(field.clone().with_name(name), dfs_names, name_idx)
+}
+
+/// Rename the field's data type but not the field itself.
+pub fn rename_fields_data_type(
+ field: Field,
+ dfs_names: &Vec<String>,
+ name_idx: &mut usize, // Index into dfs_names
+) -> datafusion::common::Result<Field> {
+ let dt = rename_data_type(field.data_type(), dfs_names, name_idx)?;
+ Ok(field.with_data_type(dt))
+}
+
+/// Traverse through the data type (incl. lists/maps/etc), renaming all inner
struct fields.
+pub fn rename_data_type(
+ data_type: &DataType,
+ dfs_names: &Vec<String>,
+ name_idx: &mut usize, // Index into dfs_names
+) -> datafusion::common::Result<DataType> {
+ match data_type {
DataType::Struct(children) => {
let children = children
.iter()
.enumerate()
- .map(|(child_idx, f)| {
- rename_field(
- f.as_ref(),
- dfs_names,
- child_idx,
- name_idx,
- /*rename_self=*/ true,
- )
+ .map(|(field_idx, f)| {
+ rename_field(f.as_ref(), dfs_names, field_idx, name_idx)
})
.collect::<datafusion::common::Result<_>>()?;
- Ok(field
- .to_owned()
- .with_name(name)
- .with_data_type(DataType::Struct(children)))
+ Ok(DataType::Struct(children))
}
- DataType::List(inner) => {
- let renamed_inner = rename_field(
- inner.as_ref(),
+ DataType::List(inner) =>
Ok(DataType::List(Arc::new(rename_fields_data_type(
+ inner.as_ref().to_owned(),
+ dfs_names,
+ name_idx,
+ )?))),
+ DataType::LargeList(inner) => Ok(DataType::LargeList(Arc::new(
+ rename_fields_data_type(inner.as_ref().to_owned(), dfs_names,
name_idx)?,
+ ))),
+ DataType::ListView(inner) => Ok(DataType::ListView(Arc::new(
+ rename_fields_data_type(inner.as_ref().to_owned(), dfs_names,
name_idx)?,
+ ))),
+ DataType::LargeListView(inner) => Ok(DataType::LargeListView(Arc::new(
+ rename_fields_data_type(inner.as_ref().to_owned(), dfs_names,
name_idx)?,
+ ))),
+ DataType::FixedSizeList(inner, len) => Ok(DataType::FixedSizeList(
+ Arc::new(rename_fields_data_type(
+ inner.as_ref().to_owned(),
dfs_names,
- 0,
name_idx,
- /*rename_self=*/ false,
- )?;
- Ok(field
- .to_owned()
- .with_data_type(DataType::List(FieldRef::new(renamed_inner)))
- .with_name(name))
+ )?),
+ *len,
+ )),
+ DataType::Map(entries, sorted) => {
+ let entries_data_type = match entries.data_type() {
+ DataType::Struct(fields) => {
+ // This should be two fields, normally "key" and "value",
but not guaranteed
+ let fields = fields
Review Comment:
Ah, I got a bit confused here, but I think you're doing the right thing. You
are _not_ pulling an item from `dfs_names` for the `key` or `value` entry, which
is correct. You will simply reuse whatever names those fields happen to have,
which is good.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]