martin-g commented on code in PR #21389: URL: https://github.com/apache/datafusion/pull/21389#discussion_r3039569260
########## datafusion/functions/src/core/arrow_field.rs: ########## @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + Array, BooleanArray, MapBuilder, StringArray, StringBuilder, StructArray, +}; +use arrow::datatypes::{DataType, Field, Fields}; +use datafusion_common::{Result, ScalarValue, utils::take_function_args}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion_macros::user_doc; +use std::sync::Arc; + +#[user_doc( + doc_section(label = "Other Functions"), + description = "Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata.", + syntax_example = "arrow_field(expression)", + sql_example = r#"```sql +> select arrow_field(1); ++----------------------------------------------+ +| arrow_field(Int64(1)) | ++----------------------------------------------+ +| {name: Int64(1), data_type: Int64, ...} | Review Comment: ```suggestion | {name: lit, data_type: Int64, ...} | ``` According to the .slt the field name in this case is `lit` - https://github.com/apache/datafusion/pull/21389/changes#diff-daa714d29bc8a2e23a3c9312e45808df23e144cc60b70493b5410d986f178e87R22 ########## datafusion/functions/src/core/arrow_field.rs: ########## @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + Array, BooleanArray, MapBuilder, StringArray, StringBuilder, StructArray, +}; +use arrow::datatypes::{DataType, Field, Fields}; +use datafusion_common::{Result, ScalarValue, utils::take_function_args}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion_macros::user_doc; +use std::sync::Arc; + +#[user_doc( + doc_section(label = "Other Functions"), + description = "Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata.", + syntax_example = "arrow_field(expression)", + sql_example = r#"```sql +> select arrow_field(1); ++----------------------------------------------+ +| arrow_field(Int64(1)) | ++----------------------------------------------+ +| {name: Int64(1), data_type: Int64, ...} | ++----------------------------------------------+ + +> select arrow_field(1)['data_type']; ++-----------------------------------+ +| arrow_field(Int64(1))[data_type] | ++-----------------------------------+ +| Int64 | ++-----------------------------------+ +```"#, + argument( + name = "expression", + description = "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators." + ) +)] +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct ArrowFieldFunc { + signature: Signature, +} + +impl Default for ArrowFieldFunc { + fn default() -> Self { + Self::new() + } +} + +impl ArrowFieldFunc { + pub fn new() -> Self { + Self { + signature: Signature::any(1, Volatility::Immutable), + } + } + + fn return_struct_type() -> DataType { + DataType::Struct(Fields::from(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("data_type", DataType::Utf8, false), + Field::new("nullable", DataType::Boolean, false), + Field::new( + "metadata", + DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + )), + false, + ), + false, + ), + ])) + } +} + +impl ScalarUDFImpl for ArrowFieldFunc { + fn name(&self) -> &str { + "arrow_field" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + Ok(Self::return_struct_type()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { + let [_arg] = take_function_args(self.name(), args.args)?; + let field = &args.arg_fields[0]; + + // Build the name array + let name_array = + Arc::new(StringArray::from(vec![field.name().as_str()])) as Arc<dyn Array>; + + // Build the data_type array + let data_type_str = format!("{}", field.data_type()); Review Comment: ```suggestion let data_type_str = field.data_type().to_string(); ``` ########## datafusion/sqllogictest/test_files/arrow_field.slt: ########## @@ -0,0 +1,104 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# arrow_field on integer literal +query ? +SELECT arrow_field(1) +---- +{name: lit, data_type: Int64, nullable: false, metadata: {}} + +# arrow_field on null literal +query ? +SELECT arrow_field(null) +---- +{name: lit, data_type: Null, nullable: true, metadata: {}} + +# arrow_field on boolean literal +query ? +SELECT arrow_field(true) +---- +{name: lit, data_type: Boolean, nullable: false, metadata: {}} + +# arrow_field on string literal +query ? +SELECT arrow_field('foo') +---- +{name: lit, data_type: Utf8, nullable: false, metadata: {}} + +# arrow_field on float literal +query ? +SELECT arrow_field(1.0) +---- +{name: lit, data_type: Float64, nullable: false, metadata: {}} + +# arrow_field on list +query ? +SELECT arrow_field(ARRAY[1,2,3]) +---- +{name: lit, data_type: List(Int64), nullable: false, metadata: {}} Review Comment: It would be good to have tests for more complex types like Map and Struct too! Maybe a Dictionary too. ########## datafusion/functions/src/core/arrow_field.rs: ########## @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + Array, BooleanArray, MapBuilder, StringArray, StringBuilder, StructArray, +}; +use arrow::datatypes::{DataType, Field, Fields}; +use datafusion_common::{Result, ScalarValue, utils::take_function_args}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion_macros::user_doc; +use std::sync::Arc; + +#[user_doc( + doc_section(label = "Other Functions"), + description = "Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata.", + syntax_example = "arrow_field(expression)", + sql_example = r#"```sql +> select arrow_field(1); ++----------------------------------------------+ +| arrow_field(Int64(1)) | ++----------------------------------------------+ +| {name: Int64(1), data_type: Int64, ...} | ++----------------------------------------------+ + +> select arrow_field(1)['data_type']; ++-----------------------------------+ +| arrow_field(Int64(1))[data_type] | ++-----------------------------------+ +| Int64 | ++-----------------------------------+ +```"#, + argument( + name = "expression", + description = "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators." + ) +)] +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct ArrowFieldFunc { + signature: Signature, +} + +impl Default for ArrowFieldFunc { + fn default() -> Self { + Self::new() + } +} + +impl ArrowFieldFunc { + pub fn new() -> Self { + Self { + signature: Signature::any(1, Volatility::Immutable), + } + } + + fn return_struct_type() -> DataType { + DataType::Struct(Fields::from(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("data_type", DataType::Utf8, false), + Field::new("nullable", DataType::Boolean, false), + Field::new( + "metadata", + DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + )), + false, + ), + false, + ), + ])) + } +} + +impl ScalarUDFImpl for ArrowFieldFunc { + fn name(&self) -> &str { + "arrow_field" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + Ok(Self::return_struct_type()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { + let [_arg] = take_function_args(self.name(), args.args)?; + let field = &args.arg_fields[0]; + + // Build the name array + let name_array = + Arc::new(StringArray::from(vec![field.name().as_str()])) as Arc<dyn Array>; + + // Build the data_type array + let data_type_str = format!("{}", field.data_type()); + let data_type_array = + Arc::new(StringArray::from(vec![data_type_str.as_str()])) as Arc<dyn Array>; + + // Build the nullable array + let nullable_array = + Arc::new(BooleanArray::from(vec![field.is_nullable()])) as Arc<dyn Array>; + + // Build the metadata map array (same pattern as arrow_metadata.rs) + let metadata = field.metadata(); + let mut map_builder = + MapBuilder::new(None, StringBuilder::new(), StringBuilder::new()); + + let mut entries: Vec<_> = metadata.iter().collect(); + entries.sort_by_key(|(k, _)| *k); + + for (k, v) in entries { + map_builder.keys().append_value(k); + map_builder.values().append_value(v); + } + map_builder.append(true)?; + + let metadata_array = Arc::new(map_builder.finish()) as Arc<dyn Array>; + + // Build the struct + let DataType::Struct(fields) = Self::return_struct_type() else { Review Comment: nit: reuse the already constructed return type: ```suggestion let &DataType::Struct(fields) = args.return_type() else { ``` ########## datafusion/functions/src/core/arrow_field.rs: ########## @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + Array, BooleanArray, MapBuilder, StringArray, StringBuilder, StructArray, +}; +use arrow::datatypes::{DataType, Field, Fields}; +use datafusion_common::{Result, ScalarValue, utils::take_function_args}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion_macros::user_doc; +use std::sync::Arc; + +#[user_doc( + doc_section(label = "Other Functions"), + description = "Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata.", + syntax_example = "arrow_field(expression)", + sql_example = r#"```sql +> select arrow_field(1); ++----------------------------------------------+ +| arrow_field(Int64(1)) | ++----------------------------------------------+ +| {name: Int64(1), data_type: Int64, ...} | ++----------------------------------------------+ + +> select arrow_field(1)['data_type']; ++-----------------------------------+ +| arrow_field(Int64(1))[data_type] | ++-----------------------------------+ +| Int64 | ++-----------------------------------+ +```"#, + argument( + name = "expression", + description = "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators." + ) +)] +#[derive(Debug, PartialEq, Eq, Hash)] Review Comment: ```suggestion #[derive(Clone, Debug, PartialEq, Eq, Hash)] ``` Some functions derive Clone (e.g. arrow_metadata), others don't. I am not sure whether there is a rule when to derive it. ########## datafusion/functions/src/core/arrow_field.rs: ########## @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ + Array, BooleanArray, MapBuilder, StringArray, StringBuilder, StructArray, +}; +use arrow::datatypes::{DataType, Field, Fields}; +use datafusion_common::{Result, ScalarValue, utils::take_function_args}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion_macros::user_doc; +use std::sync::Arc; + +#[user_doc( + doc_section(label = "Other Functions"), + description = "Returns a struct containing the Arrow field information of the expression, including name, data type, nullability, and metadata.", + syntax_example = "arrow_field(expression)", + sql_example = r#"```sql +> select arrow_field(1); ++----------------------------------------------+ +| arrow_field(Int64(1)) | ++----------------------------------------------+ +| {name: Int64(1), data_type: Int64, ...} | ++----------------------------------------------+ + +> select arrow_field(1)['data_type']; ++-----------------------------------+ +| arrow_field(Int64(1))[data_type] | ++-----------------------------------+ +| Int64 | ++-----------------------------------+ +```"#, + argument( + name = "expression", + description = "Expression to evaluate. The expression can be a constant, column, or function, and any combination of operators." + ) +)] +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct ArrowFieldFunc { + signature: Signature, +} + +impl Default for ArrowFieldFunc { + fn default() -> Self { + Self::new() + } +} + +impl ArrowFieldFunc { + pub fn new() -> Self { + Self { + signature: Signature::any(1, Volatility::Immutable), + } + } + + fn return_struct_type() -> DataType { + DataType::Struct(Fields::from(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("data_type", DataType::Utf8, false), + Field::new("nullable", DataType::Boolean, false), + Field::new( + "metadata", + DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ])), + false, + )), + false, + ), + false, + ), + ])) + } +} + +impl ScalarUDFImpl for ArrowFieldFunc { + fn name(&self) -> &str { + "arrow_field" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + Ok(Self::return_struct_type()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { + let [_arg] = take_function_args(self.name(), args.args)?; + let field = &args.arg_fields[0]; Review Comment: ```suggestion let [field] = take_function_args(self.name(), args.arg_fields)?; ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
