This is an automated email from the ASF dual-hosted git repository.
jayzhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new bf40113e61 fix: incorrect null handling in `range` and
`generate_series` (#9574)
bf40113e61 is described below
commit bf40113e61d247e4366382f1b8b2aa442003c89f
Author: Jonah Gao <[email protected]>
AuthorDate: Tue Mar 12 18:01:23 2024 +0800
fix: incorrect null handling in `range` and `generate_series` (#9574)
* fix: incorrect null handling in `range` and `generate_series`
* add tests
* doc
---
datafusion/functions-array/src/kernels.rs | 82 ++++++++++++++++++----------
datafusion/sqllogictest/test_files/array.slt | 66 ++++++++++++++++++++++
2 files changed, 120 insertions(+), 28 deletions(-)
diff --git a/datafusion/functions-array/src/kernels.rs
b/datafusion/functions-array/src/kernels.rs
index e5231aa594..4ac32c02af 100644
--- a/datafusion/functions-array/src/kernels.rs
+++ b/datafusion/functions-array/src/kernels.rs
@@ -23,13 +23,12 @@ use arrow::array::{
LargeStringArray, ListArray, ListBuilder, OffsetSizeTrait, StringArray,
StringBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
};
-use arrow::buffer::OffsetBuffer;
use arrow::compute;
-use arrow::datatypes::Field;
-use arrow::datatypes::UInt64Type;
-use arrow::datatypes::{DataType, Date32Type, IntervalMonthDayNanoType};
+use arrow::datatypes::{
+ DataType, Date32Type, Field, IntervalMonthDayNanoType, UInt64Type,
+};
use arrow::row::{RowConverter, SortField};
-use arrow_buffer::{BooleanBufferBuilder, NullBuffer};
+use arrow_buffer::{BooleanBufferBuilder, NullBuffer, OffsetBuffer};
use arrow_schema::FieldRef;
use arrow_schema::SortOptions;
@@ -394,39 +393,66 @@ pub(super) fn gen_range(args: &[ArrayRef], include_upper:
bool) -> Result<ArrayR
let mut values = vec![];
let mut offsets = vec![0];
+ let mut valid = BooleanBufferBuilder::new(stop_array.len());
for (idx, stop) in stop_array.iter().enumerate() {
- let stop = stop.unwrap_or(0);
- let start = start_array.as_ref().map(|arr|
arr.value(idx)).unwrap_or(0);
- let step = step_array.as_ref().map(|arr| arr.value(idx)).unwrap_or(1);
- if step == 0 {
- return exec_err!(
- "step can't be 0 for function {}(start [, stop, step])",
- if include_upper {
- "generate_series"
- } else {
- "range"
- }
- );
- }
- // Below, we utilize `usize` to represent steps.
- // On 32-bit targets, the absolute value of `i64` may fail to fit into
`usize`.
- let step_abs = usize::try_from(step.unsigned_abs()).map_err(|_| {
- not_impl_datafusion_err!("step {} can't fit into usize", step)
- })?;
- values.extend(
- gen_range_iter(start, stop, step < 0,
include_upper).step_by(step_abs),
- );
- offsets.push(values.len() as i32);
+ match retrieve_range_args(start_array, stop, step_array, idx) {
+ Some((_, _, 0)) => {
+ return exec_err!(
+ "step can't be 0 for function {}(start [, stop, step])",
+ if include_upper {
+ "generate_series"
+ } else {
+ "range"
+ }
+ );
+ }
+ Some((start, stop, step)) => {
+ // Below, we utilize `usize` to represent steps.
+ // On 32-bit targets, the absolute value of `i64` may fail to
fit into `usize`.
+ let step_abs =
usize::try_from(step.unsigned_abs()).map_err(|_| {
+ not_impl_datafusion_err!("step {} can't fit into usize",
step)
+ })?;
+ values.extend(
+ gen_range_iter(start, stop, step < 0, include_upper)
+ .step_by(step_abs),
+ );
+ offsets.push(values.len() as i32);
+ valid.append(true);
+ }
+ // If any of the arguments is NULL, append a NULL value to the
result.
+ None => {
+ offsets.push(values.len() as i32);
+ valid.append(false);
+ }
+ };
}
let arr = Arc::new(ListArray::try_new(
Arc::new(Field::new("item", DataType::Int64, true)),
OffsetBuffer::new(offsets.into()),
Arc::new(Int64Array::from(values)),
- None,
+ Some(NullBuffer::new(valid.finish())),
)?);
Ok(arr)
}
+/// Get the (start, stop, step) args for the range and generate_series
function.
+/// If any of the arguments is NULL, returns None.
+fn retrieve_range_args(
+ start_array: Option<&Int64Array>,
+ stop: Option<i64>,
+ step_array: Option<&Int64Array>,
+ idx: usize,
+) -> Option<(i64, i64, i64)> {
+ // Default start value is 0 if not provided
+ let start =
+ start_array.map_or(Some(0), |arr| arr.is_valid(idx).then(||
arr.value(idx)))?;
+ let stop = stop?;
+ // Default step value is 1 if not provided
+ let step =
+ step_array.map_or(Some(1), |arr| arr.is_valid(idx).then(||
arr.value(idx)))?;
+ Some((start, stop, step))
+}
+
/// Returns an iterator of i64 values from start to stop
fn gen_range_iter(
start: i64,
diff --git a/datafusion/sqllogictest/test_files/array.slt
b/datafusion/sqllogictest/test_files/array.slt
index 434fe8c959..b729e5c10f 100644
--- a/datafusion/sqllogictest/test_files/array.slt
+++ b/datafusion/sqllogictest/test_files/array.slt
@@ -5598,6 +5598,39 @@ select
----
[] [9223372036854775807] [] [9223372036854775806] [] [-9223372036854775807] []
[-9223372036854775808]
+# Test range(start, stop, step) with NULL values
+query ?
+select range(start, stop, step) from
+ (values (1), (NULL)) as start_values(start),
+ (values (10), (NULL)) as stop_values(stop),
+ (values (3), (NULL)) as step_values(step)
+where start is null or stop is null or step is null
+----
+NULL
+NULL
+NULL
+NULL
+NULL
+NULL
+NULL
+
+# Test range(start, stop) with NULL values
+query ?
+select range(start, stop) from
+ (values (1), (NULL)) as start_values(start),
+ (values (10), (NULL)) as stop_values(stop)
+where start is null or stop is null
+----
+NULL
+NULL
+NULL
+
+# Test range(stop) with NULL value
+query ?
+select range(NULL)
+----
+NULL
+
## should throw error
query error
select range(DATE '1992-09-01', NULL, INTERVAL '1' YEAR);
@@ -5678,6 +5711,39 @@ select
----
[9223372036854775807] [9223372036854775807] [-9223372036854775808]
[-9223372036854775808]
+# Test generate_series(start, stop, step) with NULL values
+query ?
+select generate_series(start, stop, step) from
+ (values (1), (NULL)) as start_values(start),
+ (values (10), (NULL)) as stop_values(stop),
+ (values (3), (NULL)) as step_values(step)
+where start is null or stop is null or step is null
+----
+NULL
+NULL
+NULL
+NULL
+NULL
+NULL
+NULL
+
+# Test generate_series(start, stop) with NULL values
+query ?
+select generate_series(start, stop) from
+ (values (1), (NULL)) as start_values(start),
+ (values (10), (NULL)) as stop_values(stop)
+where start is null or stop is null
+----
+NULL
+NULL
+NULL
+
+# Test generate_series(stop) with NULL value
+query ?
+select generate_series(NULL)
+----
+NULL
+
## array_except