This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 1441269429 chore: refactor scalarvalue/encoding using available
upstream arrow-rs methods (#19797)
1441269429 is described below
commit 1441269429be417a6752317c7552ba98e30ae792
Author: Jeffrey Vo <[email protected]>
AuthorDate: Fri Jan 16 11:03:44 2026 +0900
chore: refactor scalarvalue/encoding using available upstream arrow-rs
methods (#19797)
These PRs are available to us now as part of the upgrade to arrow-rs 57.2.0
(#19355):
- https://github.com/apache/arrow-rs/pull/8993
- https://github.com/apache/arrow-rs/pull/9040
Make use of them in some refactorings here.
---
datafusion/common/src/scalar/mod.rs | 16 ++---
datafusion/functions/src/encoding/inner.rs | 93 ++----------------------------
2 files changed, 8 insertions(+), 101 deletions(-)
diff --git a/datafusion/common/src/scalar/mod.rs
b/datafusion/common/src/scalar/mod.rs
index f29058df8c..495f8c3b3f 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -2988,13 +2988,8 @@ impl ScalarValue {
},
ScalarValue::Utf8View(e) => match e {
Some(value) => {
- let mut builder =
-
StringViewBuilder::with_capacity(size).with_deduplicate_strings();
- // Replace with upstream arrow-rs code when available:
- // https://github.com/apache/arrow-rs/issues/9034
- for _ in 0..size {
- builder.append_value(value);
- }
+ let mut builder = StringViewBuilder::with_capacity(size);
+ builder.try_append_value_n(value, size)?;
let array = builder.finish();
Arc::new(array)
}
@@ -3012,11 +3007,8 @@ impl ScalarValue {
},
ScalarValue::BinaryView(e) => match e {
Some(value) => {
- let mut builder =
-
BinaryViewBuilder::with_capacity(size).with_deduplicate_strings();
- for _ in 0..size {
- builder.append_value(value);
- }
+ let mut builder = BinaryViewBuilder::with_capacity(size);
+ builder.try_append_value_n(value, size)?;
let array = builder.finish();
Arc::new(array)
}
diff --git a/datafusion/functions/src/encoding/inner.rs
b/datafusion/functions/src/encoding/inner.rs
index 7b72c264e5..ce7f534506 100644
--- a/datafusion/functions/src/encoding/inner.rs
+++ b/datafusion/functions/src/encoding/inner.rs
@@ -19,8 +19,8 @@
use arrow::{
array::{
- Array, ArrayRef, AsArray, BinaryArrayType, FixedSizeBinaryArray,
- GenericBinaryArray, GenericStringArray, OffsetSizeTrait,
+ Array, ArrayRef, AsArray, BinaryArrayType, GenericBinaryArray,
+ GenericStringArray, OffsetSizeTrait,
},
datatypes::DataType,
};
@@ -239,7 +239,7 @@ fn encode_array(array: &ArrayRef, encoding: Encoding) ->
Result<ColumnarValue> {
encoding.encode_array::<_, i64>(&array.as_binary::<i64>())
}
DataType::FixedSizeBinary(_) => {
- encoding.encode_fsb_array(array.as_fixed_size_binary())
+ encoding.encode_array::<_, i32>(&array.as_fixed_size_binary())
}
dt => {
internal_err!("Unexpected data type for encode: {dt}")
@@ -307,7 +307,7 @@ fn decode_array(array: &ArrayRef, encoding: Encoding) ->
Result<ColumnarValue> {
let array = array.as_fixed_size_binary();
// TODO: could we be more conservative by accounting for nulls?
let estimate = array.len().saturating_mul(*size as usize);
- encoding.decode_fsb_array(array, estimate)
+ encoding.decode_array::<_, i32>(&array, estimate)
}
dt => {
internal_err!("Unexpected data type for decode: {dt}")
@@ -404,24 +404,6 @@ impl Encoding {
}
}
- // TODO: refactor this away once
https://github.com/apache/arrow-rs/pull/8993 lands
- fn encode_fsb_array(self, array: &FixedSizeBinaryArray) ->
Result<ArrayRef> {
- match self {
- Self::Base64 => {
- let array: GenericStringArray<i32> = array
- .iter()
- .map(|x| x.map(|x| BASE64_ENGINE.encode(x)))
- .collect();
- Ok(Arc::new(array))
- }
- Self::Hex => {
- let array: GenericStringArray<i32> =
- array.iter().map(|x| x.map(hex::encode)).collect();
- Ok(Arc::new(array))
- }
- }
- }
-
// OutputOffset important to ensure Large types output Large arrays
fn decode_array<'a, InputBinaryArray, OutputOffset>(
self,
@@ -461,73 +443,6 @@ impl Encoding {
}
}
}
-
- // TODO: refactor this away once
https://github.com/apache/arrow-rs/pull/8993 lands
- fn decode_fsb_array(
- self,
- value: &FixedSizeBinaryArray,
- approx_data_size: usize,
- ) -> Result<ArrayRef> {
- fn hex_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> {
- // only write input / 2 bytes to buf
- let out_len = input.len() / 2;
- let buf = &mut buf[..out_len];
- hex::decode_to_slice(input, buf)
- .map_err(|e| exec_datafusion_err!("Failed to decode from hex:
{e}"))?;
- Ok(out_len)
- }
-
- fn base64_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> {
- BASE64_ENGINE
- .decode_slice(input, buf)
- .map_err(|e| exec_datafusion_err!("Failed to decode from
base64: {e}"))
- }
-
- fn delegated_decode<DecodeFunction>(
- decode: DecodeFunction,
- input: &FixedSizeBinaryArray,
- conservative_upper_bound_size: usize,
- ) -> Result<ArrayRef>
- where
- DecodeFunction: Fn(&[u8], &mut [u8]) -> Result<usize>,
- {
- let mut values = vec![0; conservative_upper_bound_size];
- let mut offsets = OffsetBufferBuilder::new(input.len());
- let mut total_bytes_decoded = 0;
- for v in input.iter() {
- if let Some(v) = v {
- let cursor = &mut values[total_bytes_decoded..];
- let decoded = decode(v, cursor)?;
- total_bytes_decoded += decoded;
- offsets.push_length(decoded);
- } else {
- offsets.push_length(0);
- }
- }
- // We reserved an upper bound size for the values buffer, but we
only use the actual size
- values.truncate(total_bytes_decoded);
- let binary_array = GenericBinaryArray::<i32>::try_new(
- offsets.finish(),
- Buffer::from_vec(values),
- input.nulls().cloned(),
- )?;
- Ok(Arc::new(binary_array))
- }
-
- match self {
- Self::Base64 => {
- let upper_bound =
base64::decoded_len_estimate(approx_data_size);
- delegated_decode(base64_decode, value, upper_bound)
- }
- Self::Hex => {
- // Calculate the upper bound for decoded byte size
- // For hex encoding, each pair of hex characters (2 bytes)
represents 1 byte when decoded
- // So the upper bound is half the length of the input values.
- let upper_bound = approx_data_size / 2;
- delegated_decode(hex_decode, value, upper_bound)
- }
- }
- }
}
fn delegated_decode<'a, DecodeFunction, InputBinaryArray, OutputOffset>(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]