advancedxy commented on code in PR #449: URL: https://github.com/apache/datafusion-comet/pull/449#discussion_r1610001541
########## core/src/execution/datafusion/expressions/scalar_funcs/hex.rs: ########## @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::{ + array::{as_dictionary_array, as_string_array}, + datatypes::Int32Type, +}; +use arrow_array::StringArray; +use arrow_schema::DataType; +use datafusion::logical_expr::ColumnarValue; +use datafusion_common::{ + cast::{as_binary_array, as_fixed_size_binary_array, as_int64_array}, + exec_err, DataFusionError, ScalarValue, +}; +use std::fmt::Write; + +fn hex_int64(num: i64) -> String { + if num >= 0 { + format!("{:X}", num) + } else { + format!("{:016X}", num as u64) + } +} + +fn hex_bytes(bytes: &[u8]) -> Vec<u8> { + let length = bytes.len(); + let mut value = vec![0; length * 2]; + let mut i = 0; + while i < length { + value[i * 2] = (bytes[i] & 0xF0) >> 4; + value[i * 2 + 1] = bytes[i] & 0x0F; + i += 1; + } + value +} + +fn hex_string(s: &str) -> Vec<u8> { + hex_bytes(s.as_bytes()) +} + +fn hex_bytes_to_string(bytes: &[u8]) -> Result<String, std::fmt::Error> { + let mut hex_string = String::with_capacity(bytes.len() * 2); Review Comment: Hmm. I don't think we should double the size for string here. The bytes are already hexed bytes, aren't they? And by the way, I think hex_bytes and hex_bytes_to_string should be combined into a single function, something like: ```rust fn hex_bytes(bytes: &[u8]) -> String { let length = bytes.len(); let mut hex_string = String::with_capacity(bytes.len() * 2); let mut i = 0; while i < length { write!(&mut hex_string, "{:X}", (bytes[i] & 0xF0) >> 4).unwrap(); write!(&mut hex_string, "{:X}", bytes[i] & 0x0F).unwrap(); // or simply write // write!(&mut hex_string, "{:02X}", bytes[i]).unwrap(); i += 1; } hex_string } ``` ########## core/src/execution/datafusion/expressions/scalar_funcs/hex.rs: ########## @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::{ + array::{as_dictionary_array, as_string_array}, + datatypes::Int32Type, +}; +use arrow_array::StringArray; +use arrow_schema::DataType; +use datafusion::logical_expr::ColumnarValue; +use datafusion_common::{ + cast::{as_binary_array, as_fixed_size_binary_array, as_int64_array}, + exec_err, DataFusionError, ScalarValue, +}; +use std::fmt::Write; + +fn hex_int64(num: i64) -> String { + if num >= 0 { + format!("{:X}", num) + } else { + format!("{:016X}", num as u64) Review Comment: is this needed? If the num is a negative i64, there's a leading 1 in its binary representation. So, there's no need to add width into the format? Please correct me if I'm wrong here, we can simply write `format!("{:X}", num)` here? ########## core/src/execution/datafusion/expressions/scalar_funcs/hex.rs: ########## @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::{ + array::{as_dictionary_array, as_string_array}, + datatypes::Int32Type, +}; +use arrow_array::StringArray; +use arrow_schema::DataType; +use datafusion::logical_expr::ColumnarValue; +use datafusion_common::{ + cast::{as_binary_array, as_fixed_size_binary_array, as_int64_array}, + exec_err, DataFusionError, ScalarValue, +}; +use std::fmt::Write; + +fn hex_int64(num: i64) -> String { + if num >= 0 { + format!("{:X}", num) + } else { + format!("{:016X}", num as u64) + } +} + +fn hex_bytes(bytes: &[u8]) -> Vec<u8> { + let length = bytes.len(); + let mut value = vec![0; length * 2]; + let mut i = 0; + while i < length { + value[i * 2] = (bytes[i] & 0xF0) >> 4; + value[i * 2 + 1] = bytes[i] & 0x0F; + i += 1; + } + value +} + +fn hex_string(s: &str) -> Vec<u8> { + hex_bytes(s.as_bytes()) +} + +fn hex_bytes_to_string(bytes: &[u8]) -> Result<String, std::fmt::Error> { + let mut hex_string = String::with_capacity(bytes.len() * 2); + for byte in bytes { + write!(&mut hex_string, "{:X}", byte)?; + } + Ok(hex_string) +} + +pub(super) fn spark_hex(args: &[ColumnarValue]) -> Result<ColumnarValue, DataFusionError> { + if args.len() != 1 { + return Err(DataFusionError::Internal( + "hex expects exactly one argument".to_string(), + )); + } + + match &args[0] { + ColumnarValue::Array(array) => match array.data_type() { + DataType::Int64 => { + let array = as_int64_array(array)?; + + let hexed: Vec<Option<String>> = array.iter().map(|v| v.map(hex_int64)).collect(); + + let string_array = StringArray::from(hexed); + Ok(ColumnarValue::Array(Arc::new(string_array))) + } + DataType::Utf8 | DataType::LargeUtf8 => { + let array = as_string_array(array); + + let hexed: Vec<Option<String>> = array + .iter() + .map(|v| v.map(|v| hex_bytes_to_string(&hex_string(v))).transpose()) + .collect::<Result<_, _>>()?; + + let string_array = StringArray::from(hexed); + + Ok(ColumnarValue::Array(Arc::new(string_array))) + } + DataType::Binary => { + let array = as_binary_array(array)?; + + let hexed: Vec<Option<String>> = array + .iter() + .map(|v| v.map(|v| hex_bytes_to_string(&hex_bytes(v))).transpose()) + .collect::<Result<_, _>>()?; + + let string_array = StringArray::from(hexed); + + Ok(ColumnarValue::Array(Arc::new(string_array))) + } + DataType::FixedSizeBinary(_) => { + let array = as_fixed_size_binary_array(array)?; + + let hexed: Vec<Option<String>> = array + .iter() + .map(|v| v.map(|v| hex_bytes_to_string(&hex_bytes(v))).transpose()) + .collect::<Result<_, _>>()?; + + let string_array = StringArray::from(hexed); + + Ok(ColumnarValue::Array(Arc::new(string_array))) + } + DataType::Dictionary(_, value_type) if matches!(**value_type, DataType::Int64) => { Review Comment: Not totally related to this PR. But this seems a bit unexpected. I believe many other codes in scalar_funcs and datafusion doesn't handle the dictionary type specially, such as `spark_ceil`, `spark_rpad` and `spark_murmur3_hash` or other functions registered in DataFusion. If we are going to handle dictionary types, we should also update these functions too? Or we should do it in a more systematic way, such as flatten the dictionary types first. cc @sunchao @andygrove and @viirya for more inputs. ########## core/src/execution/datafusion/expressions/scalar_funcs/hex.rs: ########## @@ -0,0 +1,371 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use arrow::{ + array::{as_dictionary_array, as_string_array}, + datatypes::Int32Type, +}; +use arrow_array::StringArray; +use arrow_schema::DataType; +use datafusion::logical_expr::ColumnarValue; +use datafusion_common::{ + cast::{as_binary_array, as_fixed_size_binary_array, as_int64_array}, + exec_err, DataFusionError, ScalarValue, +}; +use std::fmt::Write; + +fn hex_int64(num: i64) -> String { + if num >= 0 { + format!("{:X}", num) + } else { + format!("{:016X}", num as u64) + } +} + +fn hex_bytes(bytes: &[u8]) -> Vec<u8> { + let length = bytes.len(); + let mut value = vec![0; length * 2]; + let mut i = 0; + while i < length { + value[i * 2] = (bytes[i] & 0xF0) >> 4; + value[i * 2 + 1] = bytes[i] & 0x0F; + i += 1; + } + value +} + +fn hex_string(s: &str) -> Vec<u8> { + hex_bytes(s.as_bytes()) +} + +fn hex_bytes_to_string(bytes: &[u8]) -> Result<String, std::fmt::Error> { + let mut hex_string = String::with_capacity(bytes.len() * 2); + for byte in bytes { + write!(&mut hex_string, "{:X}", byte)?; + } + Ok(hex_string) +} + +pub(super) fn spark_hex(args: &[ColumnarValue]) -> Result<ColumnarValue, DataFusionError> { + if args.len() != 1 { + return Err(DataFusionError::Internal( + "hex expects exactly one argument".to_string(), + )); + } + + match &args[0] { + ColumnarValue::Array(array) => match array.data_type() { + DataType::Int64 => { + let array = as_int64_array(array)?; + + let hexed: Vec<Option<String>> = array.iter().map(|v| v.map(hex_int64)).collect(); + + let string_array = StringArray::from(hexed); + Ok(ColumnarValue::Array(Arc::new(string_array))) + } + DataType::Utf8 | DataType::LargeUtf8 => { + let array = as_string_array(array); + + let hexed: Vec<Option<String>> = array + .iter() + .map(|v| v.map(|v| hex_bytes_to_string(&hex_string(v))).transpose()) + .collect::<Result<_, _>>()?; + + let string_array = StringArray::from(hexed); Review Comment: I think it can be simplified to something like below, other places apply to this as well. Assuming `hex_string` already returns a string. ```suggestion let hexed_array: StringArray = array .iter() .map(|v| v.map(|v| hex_string(v)) .collect(); ``` you can refer src/execution/datafusion/expressions/scalar_funcs.rs:685 for similar code. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org