This is an automated email from the ASF dual-hosted git repository. blaginin pushed a commit to branch annarose/dict-coercion in repository https://gitbox.apache.org/repos/asf/datafusion-sandbox.git
commit 57ff351566b5e33f44c1ed232b7efe290a703481 Author: Kumar Ujjawal <[email protected]> AuthorDate: Wed Feb 4 20:55:38 2026 +0530 perf: Optimize scalar fast path of to_hex function (#20112) ## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Part of https://github.com/apache/datafusion-comet/issues/2986 ## Rationale for this change `to_hex` (used by benchmark items `hex_int` / `hex_long`) previously routed evaluation through `make_scalar_function(..., vec![])`, which converts scalar inputs into size‑1 arrays before execution. This adds avoidable overhead for constant folding / scalar evaluation. <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> ## What changes are included in this PR? - Add match-based scalar fast path for integer scalars: - `Int8/16/32/64` and `UInt8/16/32/64` - Remove `make_scalar_function(..., vec![])` usage | Type | Before | After | Speedup | |------|--------|-------|---------| | `to_hex/scalar_i32` | 270.73 ns | 86.676 ns | **3.12x** | | `to_hex/scalar_i64` | 254.71 ns | 89.254 ns | **2.85x** | <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> ## Are these changes tested? Yes <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> ## Are there any user-facing changes? No <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. --> --- datafusion/functions/benches/to_hex.rs | 37 ++++++++++ datafusion/functions/src/string/to_hex.rs | 109 ++++++++++++++++++++-------- datafusion/sqllogictest/test_files/expr.slt | 21 ++++++ 3 files changed, 135 insertions(+), 32 deletions(-) diff --git a/datafusion/functions/benches/to_hex.rs b/datafusion/functions/benches/to_hex.rs index 1c6757a29..356c3c6c0 100644 --- a/datafusion/functions/benches/to_hex.rs +++ b/datafusion/functions/benches/to_hex.rs @@ -21,6 +21,7 @@ use arrow::array::Int64Array; use arrow::datatypes::{DataType, Field, Int32Type, Int64Type}; use arrow::util::bench_util::create_primitive_array; use criterion::{Criterion, SamplingMode, criterion_group, criterion_main}; +use datafusion_common::ScalarValue; use datafusion_common::config::ConfigOptions; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::string; @@ -32,6 +33,42 @@ fn criterion_benchmark(c: &mut Criterion) { let hex = string::to_hex(); let config_options = Arc::new(ConfigOptions::default()); + c.bench_function("to_hex/scalar_i32", |b| { + let args = vec![ColumnarValue::Scalar(ScalarValue::Int32(Some(2147483647)))]; + let arg_fields = vec![Field::new("a", DataType::Int32, true).into()]; + b.iter(|| { + black_box( + hex.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: 1, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .unwrap(), + ) + }) + }); + + c.bench_function("to_hex/scalar_i64", |b| { + let args = vec![ColumnarValue::Scalar(ScalarValue::Int64(Some( + 9223372036854775807, + )))]; + let arg_fields = vec![Field::new("a", DataType::Int64, true).into()]; + b.iter(|| { + black_box( + hex.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: 1, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .unwrap(), + ) + }) + }); + for size in [1024, 4096, 8192] { let mut group = c.benchmark_group(format!("to_hex size={size}")); group.sampling_mode(SamplingMode::Flat); diff --git a/datafusion/functions/src/string/to_hex.rs b/datafusion/functions/src/string/to_hex.rs index 891cbe254..ed8ce07b8 100644 --- a/datafusion/functions/src/string/to_hex.rs +++ b/datafusion/functions/src/string/to_hex.rs @@ -18,7 +18,6 @@ use std::any::Any; use std::sync::Arc; -use crate::utils::make_scalar_function; use arrow::array::{Array, ArrayRef, StringArray}; use arrow::buffer::{Buffer, OffsetBuffer}; use arrow::datatypes::{ @@ -26,7 +25,7 @@ use arrow::datatypes::{ Int64Type, UInt8Type, UInt16Type, UInt32Type, UInt64Type, }; use datafusion_common::cast::as_primitive_array; -use datafusion_common::{Result, ScalarValue, exec_err}; +use datafusion_common::{Result, ScalarValue, exec_err, internal_err}; use datafusion_expr::{ Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignatureClass, Volatility, @@ -38,11 +37,11 @@ const HEX_CHARS: &[u8; 16] = b"0123456789abcdef"; /// Converts the number to its equivalent hexadecimal representation. /// to_hex(2147483647) = '7fffffff' -fn to_hex<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef> +fn to_hex_array<T: ArrowPrimitiveType>(array: &ArrayRef) -> Result<ArrayRef> where T::Native: ToHex, { - let integer_array = as_primitive_array::<T>(&args[0])?; + let integer_array = as_primitive_array::<T>(array)?; let len = integer_array.len(); // Max hex string length: 16 chars for u64/i64 @@ -78,6 +77,14 @@ where Ok(Arc::new(result) as ArrayRef) } +#[inline] +fn to_hex_scalar<T: ToHex>(value: T) -> String { + let mut hex_buffer = [0u8; 16]; + let hex_len = value.write_hex_to_buffer(&mut hex_buffer); + // SAFETY: hex_buffer is ASCII hex digits + unsafe { std::str::from_utf8_unchecked(&hex_buffer[16 - hex_len..]).to_string() } +} + /// Trait for converting integer types to hexadecimal in a buffer trait ToHex: ArrowNativeType { /// Write hex representation to buffer and return the number of hex digits written. @@ -223,33 +230,71 @@ impl ScalarUDFImpl for ToHexFunc { } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { - match args.args[0].data_type() { - DataType::Null => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))), - DataType::Int64 => { - make_scalar_function(to_hex::<Int64Type>, vec![])(&args.args) - } - DataType::UInt64 => { - make_scalar_function(to_hex::<UInt64Type>, vec![])(&args.args) - } - DataType::Int32 => { - make_scalar_function(to_hex::<Int32Type>, vec![])(&args.args) - } - DataType::UInt32 => { - make_scalar_function(to_hex::<UInt32Type>, vec![])(&args.args) - } - DataType::Int16 => { - make_scalar_function(to_hex::<Int16Type>, vec![])(&args.args) - } - DataType::UInt16 => { - make_scalar_function(to_hex::<UInt16Type>, vec![])(&args.args) - } - DataType::Int8 => { - make_scalar_function(to_hex::<Int8Type>, vec![])(&args.args) - } - DataType::UInt8 => { - make_scalar_function(to_hex::<UInt8Type>, vec![])(&args.args) + let arg = &args.args[0]; + + match arg { + ColumnarValue::Scalar(ScalarValue::Int64(Some(v))) => Ok( + ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))), + ), + ColumnarValue::Scalar(ScalarValue::UInt64(Some(v))) => Ok( + ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))), + ), + ColumnarValue::Scalar(ScalarValue::Int32(Some(v))) => Ok( + ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))), + ), + ColumnarValue::Scalar(ScalarValue::UInt32(Some(v))) => Ok( + ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))), + ), + ColumnarValue::Scalar(ScalarValue::Int16(Some(v))) => Ok( + ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))), + ), + ColumnarValue::Scalar(ScalarValue::UInt16(Some(v))) => Ok( + ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))), + ), + ColumnarValue::Scalar(ScalarValue::Int8(Some(v))) => Ok( + ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))), + ), + ColumnarValue::Scalar(ScalarValue::UInt8(Some(v))) => Ok( + ColumnarValue::Scalar(ScalarValue::Utf8(Some(to_hex_scalar(*v)))), + ), + + // NULL scalars + ColumnarValue::Scalar(s) if s.is_null() => { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))) } - other => exec_err!("Unsupported data type {other:?} for function to_hex"), + + ColumnarValue::Array(array) => match array.data_type() { + DataType::Int64 => { + Ok(ColumnarValue::Array(to_hex_array::<Int64Type>(array)?)) + } + DataType::UInt64 => { + Ok(ColumnarValue::Array(to_hex_array::<UInt64Type>(array)?)) + } + DataType::Int32 => { + Ok(ColumnarValue::Array(to_hex_array::<Int32Type>(array)?)) + } + DataType::UInt32 => { + Ok(ColumnarValue::Array(to_hex_array::<UInt32Type>(array)?)) + } + DataType::Int16 => { + Ok(ColumnarValue::Array(to_hex_array::<Int16Type>(array)?)) + } + DataType::UInt16 => { + Ok(ColumnarValue::Array(to_hex_array::<UInt16Type>(array)?)) + } + DataType::Int8 => { + Ok(ColumnarValue::Array(to_hex_array::<Int8Type>(array)?)) + } + DataType::UInt8 => { + Ok(ColumnarValue::Array(to_hex_array::<UInt8Type>(array)?)) + } + other => exec_err!("Unsupported data type {other:?} for function to_hex"), + }, + + other => internal_err!( + "Unexpected argument type {:?} for function to_hex", + other.data_type() + ), } } @@ -288,8 +333,8 @@ mod tests { let expected = $expected; let array = <$array_type>::from(input); - let array_ref = Arc::new(array); - let hex_result = to_hex::<$arrow_type>(&[array_ref])?; + let array_ref: ArrayRef = Arc::new(array); + let hex_result = to_hex_array::<$arrow_type>(&array_ref)?; let hex_array = as_string_array(&hex_result)?; let expected_array = StringArray::from(expected); diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 90fe05815..4e078d1e6 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -725,6 +725,27 @@ SELECT to_hex(CAST(NULL AS int)) ---- NULL +query T +SELECT to_hex(0) +---- +0 + +# negative values (two's complement encoding) +query T +SELECT to_hex(-1) +---- +ffffffffffffffff + +query T +SELECT to_hex(CAST(-1 AS INT)) +---- +ffffffffffffffff + +query T +SELECT to_hex(CAST(255 AS TINYINT UNSIGNED)) +---- +ff + query T SELECT trim(' tom ') ---- --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
