kazuyukitanimura commented on code in PR #342:
URL: https://github.com/apache/datafusion-comet/pull/342#discussion_r1594749074
##########
spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala:
##########
@@ -1396,6 +1397,16 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde {
         val optExpr = scalarExprToProto("atan2", leftExpr, rightExpr)
         optExprWithInfo(optExpr, expr, left, right)
+      case e: Unhex if !isSpark32 =>

Review Comment:
   Unrelated, but we could potentially hold a community vote on when to deprecate Spark 3.2 support...

##########
core/src/execution/datafusion/expressions/scalar_funcs/unhex.rs:
##########
@@ -0,0 +1,229 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_array::OffsetSizeTrait;
+use arrow_schema::DataType;
+use datafusion::logical_expr::ColumnarValue;
+use datafusion_common::{cast::as_generic_string_array, exec_err, DataFusionError, ScalarValue};
+
+/// Helper function to convert a hex digit to a binary value.
+fn unhex_digit(c: u8) -> Result<u8, DataFusionError> {
+    match c {
+        b'0'..=b'9' => Ok(c - b'0'),
+        b'A'..=b'F' => Ok(10 + c - b'A'),
+        b'a'..=b'f' => Ok(10 + c - b'a'),
+        _ => Err(DataFusionError::Execution(
+            "Input to unhex_digit is not a valid hex digit".to_string(),
+        )),
+    }
+}
+
+/// Convert a hex string to binary and store the result in `result`. Returns an error if the input
+/// is not a valid hex string.
+fn unhex(hex_str: &str, result: &mut Vec<u8>) -> Result<(), DataFusionError> {
+    let bytes = hex_str.as_bytes();
+
+    let mut i = 0;
+
+    if (bytes.len() & 0x01) != 0 {
+        let v = unhex_digit(bytes[0])?;
+
+        result.push(v);
+        i += 1;
+    }
+
+    while i < bytes.len() {
+        let first = unhex_digit(bytes[i])?;
+        let second = unhex_digit(bytes[i + 1])?;
+        result.push((first << 4) | second);
+
+        i += 2;
+    }
+
+    Ok(())
+}
+
+fn spark_unhex_inner<T: OffsetSizeTrait>(
+    array: &ColumnarValue,
+    fail_on_error: bool,
+) -> Result<ColumnarValue, DataFusionError> {
+    match array {
+        ColumnarValue::Array(array) => {
+            let string_array = as_generic_string_array::<T>(array)?;
+
+            let mut encoded = Vec::new();
+            let mut builder = arrow::array::BinaryBuilder::new();
+
+            for item in string_array.iter() {
+                if let Some(s) = item {
+                    if unhex(s, &mut encoded).is_ok() {
+                        builder.append_value(encoded.as_slice());
+                        encoded.clear();
+                    } else if fail_on_error {
+                        return exec_err!("Input to unhex is not a valid hex string: {s}");
+                    } else {
+                        builder.append_null();

Review Comment:
   If `unhex()` fails and `fail_on_error=false`, `encoded` is never cleared, right? Is that the same behavior as Spark? It would be great if we could add a test to verify.
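   A minimal sketch of such a test, to show the case I have in mind. Assumptions on my side, not from this PR: the test sits in the same `unhex.rs` module, `spark_unhex_inner` wraps the finished `BinaryArray` in `ColumnarValue::Array`, and the module/test names are illustrative only.

   #[cfg(test)]
   mod unhex_buffer_tests {
       use std::sync::Arc;

       use arrow::array::{Array, BinaryArray, StringArray};
       use datafusion::logical_expr::ColumnarValue;
       use datafusion_common::DataFusionError;

       use super::spark_unhex_inner;

       #[test]
       fn invalid_row_does_not_leak_into_next_row() -> Result<(), DataFusionError> {
           // Row 0 decodes "AA" into `encoded` before failing on 'Z'; row 1 is valid.
           let input = StringArray::from(vec!["AAZZ", "BB"]);
           let input = ColumnarValue::Array(Arc::new(input));

           // fail_on_error = false, so the invalid row should become NULL.
           let result = spark_unhex_inner::<i32>(&input, false)?;

           let ColumnarValue::Array(result) = result else {
               panic!("expected an array result");
           };
           let result = result
               .as_any()
               .downcast_ref::<BinaryArray>()
               .expect("expected a BinaryArray");

           assert!(result.is_null(0));
           // If `encoded` is not cleared after the failed row, this would see
           // [0xAA, 0xBB] instead of [0xBB].
           assert_eq!(result.value(1), [0xBBu8].as_slice());

           Ok(())
       }
   }

   The key case is an input that fails partway through ("AAZZ"), so that some bytes are already in `encoded` when the error occurs.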
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org