This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 7293764421 Introduce `array_except` function (#8135)
7293764421 is described below
commit 729376442138f85e135b28010ca2c0d018955292
Author: Jay Zhan <[email protected]>
AuthorDate: Sat Nov 18 03:40:47 2023 +0800
Introduce `array_except` function (#8135)
* squash commits for rebase
Signed-off-by: jayzhan211 <[email protected]>
* address comment
Signed-off-by: jayzhan211 <[email protected]>
* rename
Signed-off-by: jayzhan211 <[email protected]>
* fix rebase
Signed-off-by: jayzhan211 <[email protected]>
---------
Signed-off-by: jayzhan211 <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/expr/src/built_in_function.rs | 6 ++
datafusion/expr/src/expr_fn.rs | 6 ++
datafusion/physical-expr/src/array_expressions.rs | 80 +++++++++++++++-
datafusion/physical-expr/src/functions.rs | 3 +
datafusion/proto/proto/datafusion.proto | 5 +-
datafusion/proto/src/generated/pbjson.rs | 3 +
datafusion/proto/src/generated/prost.rs | 7 +-
datafusion/proto/src/logical_plan/from_proto.rs | 27 +++---
datafusion/proto/src/logical_plan/to_proto.rs | 1 +
datafusion/sqllogictest/test_files/array.slt | 108 ++++++++++++++++++++++
docs/source/user-guide/expressions.md | 1 +
docs/source/user-guide/sql/scalar_functions.md | 38 ++++++++
12 files changed, 269 insertions(+), 16 deletions(-)
diff --git a/datafusion/expr/src/built_in_function.rs
b/datafusion/expr/src/built_in_function.rs
index fc6f9c28e1..e9030ebcc0 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -180,6 +180,8 @@ pub enum BuiltinScalarFunction {
ArrayIntersect,
/// array_union
ArrayUnion,
+ /// array_except
+ ArrayExcept,
/// cardinality
Cardinality,
/// construct an array from columns
@@ -394,6 +396,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::ArrayHas => Volatility::Immutable,
BuiltinScalarFunction::ArrayDims => Volatility::Immutable,
BuiltinScalarFunction::ArrayElement => Volatility::Immutable,
+ BuiltinScalarFunction::ArrayExcept => Volatility::Immutable,
BuiltinScalarFunction::ArrayLength => Volatility::Immutable,
BuiltinScalarFunction::ArrayNdims => Volatility::Immutable,
BuiltinScalarFunction::ArrayPopFront => Volatility::Immutable,
@@ -601,6 +604,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Range => {
Ok(List(Arc::new(Field::new("item", Int64, true))))
}
+ BuiltinScalarFunction::ArrayExcept =>
Ok(input_expr_types[0].clone()),
BuiltinScalarFunction::Cardinality => Ok(UInt64),
BuiltinScalarFunction::MakeArray => match input_expr_types.len() {
0 => Ok(List(Arc::new(Field::new("item", Null, true)))),
@@ -887,6 +891,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::ArrayDims => Signature::any(1,
self.volatility()),
BuiltinScalarFunction::ArrayEmpty => Signature::any(1,
self.volatility()),
BuiltinScalarFunction::ArrayElement => Signature::any(2,
self.volatility()),
+ BuiltinScalarFunction::ArrayExcept => Signature::any(2,
self.volatility()),
BuiltinScalarFunction::Flatten => Signature::any(1,
self.volatility()),
BuiltinScalarFunction::ArrayHasAll
| BuiltinScalarFunction::ArrayHasAny
@@ -1521,6 +1526,7 @@ fn aliases(func: &BuiltinScalarFunction) -> &'static
[&'static str] {
"list_element",
"list_extract",
],
+ BuiltinScalarFunction::ArrayExcept => &["array_except", "list_except"],
BuiltinScalarFunction::Flatten => &["flatten"],
BuiltinScalarFunction::ArrayHasAll => &["array_has_all",
"list_has_all"],
BuiltinScalarFunction::ArrayHasAny => &["array_has_any",
"list_has_any"],
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index 75b7628044..674d2a34df 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -640,6 +640,12 @@ scalar_expr!(
array element,
"extracts the element with the index n from the array."
);
+scalar_expr!(
+ ArrayExcept,
+ array_except,
+ first_array second_array,
+ "Returns an array of the elements that appear in the first array but not
in the second."
+);
scalar_expr!(
ArrayLength,
array_length,
diff --git a/datafusion/physical-expr/src/array_expressions.rs
b/datafusion/physical-expr/src/array_expressions.rs
index c5e8b0e75c..8bb70c3168 100644
--- a/datafusion/physical-expr/src/array_expressions.rs
+++ b/datafusion/physical-expr/src/array_expressions.rs
@@ -18,6 +18,7 @@
//! Array expressions
use std::any::type_name;
+use std::collections::HashSet;
use std::sync::Arc;
use arrow::array::*;
@@ -38,7 +39,6 @@ use datafusion_common::{
};
use itertools::Itertools;
-use std::collections::HashSet;
macro_rules! downcast_arg {
($ARG:expr, $ARRAY_TYPE:ident) => {{
@@ -523,6 +523,84 @@ pub fn array_element(args: &[ArrayRef]) ->
Result<ArrayRef> {
define_array_slice(list_array, key, key, true)
}
+fn general_except<OffsetSize: OffsetSizeTrait>(
+ l: &GenericListArray<OffsetSize>,
+ r: &GenericListArray<OffsetSize>,
+ field: &FieldRef,
+) -> Result<GenericListArray<OffsetSize>> {
+ let converter = RowConverter::new(vec![SortField::new(l.value_type())])?;
+
+ let l_values = l.values().to_owned();
+ let r_values = r.values().to_owned();
+ let l_values = converter.convert_columns(&[l_values])?;
+ let r_values = converter.convert_columns(&[r_values])?;
+
+ let mut offsets = Vec::<OffsetSize>::with_capacity(l.len() + 1);
+ offsets.push(OffsetSize::usize_as(0));
+
+ let mut rows = Vec::with_capacity(l_values.num_rows());
+ let mut dedup = HashSet::new();
+
+ for (l_w, r_w) in l.offsets().windows(2).zip(r.offsets().windows(2)) {
+ let l_slice = l_w[0].as_usize()..l_w[1].as_usize();
+ let r_slice = r_w[0].as_usize()..r_w[1].as_usize();
+ for i in r_slice {
+ let right_row = r_values.row(i);
+ dedup.insert(right_row);
+ }
+ for i in l_slice {
+ let left_row = l_values.row(i);
+ if dedup.insert(left_row) {
+ rows.push(left_row);
+ }
+ }
+
+ offsets.push(OffsetSize::usize_as(rows.len()));
+ dedup.clear();
+ }
+
+ if let Some(values) = converter.convert_rows(rows)?.get(0) {
+ Ok(GenericListArray::<OffsetSize>::new(
+ field.to_owned(),
+ OffsetBuffer::new(offsets.into()),
+ values.to_owned(),
+ l.nulls().cloned(),
+ ))
+ } else {
+ internal_err!("array_except failed to convert rows")
+ }
+}
+
+pub fn array_except(args: &[ArrayRef]) -> Result<ArrayRef> {
+ if args.len() != 2 {
+ return internal_err!("array_except needs two arguments");
+ }
+
+ let array1 = &args[0];
+ let array2 = &args[1];
+
+ match (array1.data_type(), array2.data_type()) {
+ (DataType::Null, _) | (_, DataType::Null) => Ok(array1.to_owned()),
+ (DataType::List(field), DataType::List(_)) => {
+ check_datatypes("array_except", &[&array1, &array2])?;
+ let list1 = array1.as_list::<i32>();
+ let list2 = array2.as_list::<i32>();
+ let result = general_except::<i32>(list1, list2, field)?;
+ Ok(Arc::new(result))
+ }
+ (DataType::LargeList(field), DataType::LargeList(_)) => {
+ check_datatypes("array_except", &[&array1, &array2])?;
+ let list1 = array1.as_list::<i64>();
+ let list2 = array2.as_list::<i64>();
+ let result = general_except::<i64>(list1, list2, field)?;
+ Ok(Arc::new(result))
+ }
+ (dt1, dt2) => {
+ internal_err!("array_except got unexpected types: {dt1:?} and
{dt2:?}")
+ }
+ }
+}
+
pub fn array_slice(args: &[ArrayRef]) -> Result<ArrayRef> {
let list_array = as_list_array(&args[0])?;
let key = as_int64_array(&args[1])?;
diff --git a/datafusion/physical-expr/src/functions.rs
b/datafusion/physical-expr/src/functions.rs
index b46249d26d..5a1a68dd21 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -350,6 +350,9 @@ pub fn create_physical_fun(
BuiltinScalarFunction::ArrayElement => {
Arc::new(|args|
make_scalar_function(array_expressions::array_element)(args))
}
+ BuiltinScalarFunction::ArrayExcept => {
+ Arc::new(|args|
make_scalar_function(array_expressions::array_except)(args))
+ }
BuiltinScalarFunction::ArrayLength => {
Arc::new(|args|
make_scalar_function(array_expressions::array_length)(args))
}
diff --git a/datafusion/proto/proto/datafusion.proto
b/datafusion/proto/proto/datafusion.proto
index 8cab62acde..ad83ea1fce 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -638,8 +638,9 @@ enum ScalarFunction {
ArrayUnion = 120;
OverLay = 121;
Range = 122;
- ArrayPopFront = 123;
- Levenshtein = 124;
+ ArrayExcept = 123;
+ ArrayPopFront = 124;
+ Levenshtein = 125;
}
message ScalarFunctionNode {
diff --git a/datafusion/proto/src/generated/pbjson.rs
b/datafusion/proto/src/generated/pbjson.rs
index c50571dca0..016719a600 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -20937,6 +20937,7 @@ impl serde::Serialize for ScalarFunction {
Self::ArrayUnion => "ArrayUnion",
Self::OverLay => "OverLay",
Self::Range => "Range",
+ Self::ArrayExcept => "ArrayExcept",
Self::ArrayPopFront => "ArrayPopFront",
Self::Levenshtein => "Levenshtein",
};
@@ -21073,6 +21074,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"ArrayUnion",
"OverLay",
"Range",
+ "ArrayExcept",
"ArrayPopFront",
"Levenshtein",
];
@@ -21238,6 +21240,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"ArrayUnion" => Ok(ScalarFunction::ArrayUnion),
"OverLay" => Ok(ScalarFunction::OverLay),
"Range" => Ok(ScalarFunction::Range),
+ "ArrayExcept" => Ok(ScalarFunction::ArrayExcept),
"ArrayPopFront" => Ok(ScalarFunction::ArrayPopFront),
"Levenshtein" => Ok(ScalarFunction::Levenshtein),
_ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
diff --git a/datafusion/proto/src/generated/prost.rs
b/datafusion/proto/src/generated/prost.rs
index 213be1c395..647f814fda 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2571,8 +2571,9 @@ pub enum ScalarFunction {
ArrayUnion = 120,
OverLay = 121,
Range = 122,
- ArrayPopFront = 123,
- Levenshtein = 124,
+ ArrayExcept = 123,
+ ArrayPopFront = 124,
+ Levenshtein = 125,
}
impl ScalarFunction {
/// String value of the enum field names used in the ProtoBuf definition.
@@ -2704,6 +2705,7 @@ impl ScalarFunction {
ScalarFunction::ArrayUnion => "ArrayUnion",
ScalarFunction::OverLay => "OverLay",
ScalarFunction::Range => "Range",
+ ScalarFunction::ArrayExcept => "ArrayExcept",
ScalarFunction::ArrayPopFront => "ArrayPopFront",
ScalarFunction::Levenshtein => "Levenshtein",
}
@@ -2834,6 +2836,7 @@ impl ScalarFunction {
"ArrayUnion" => Some(Self::ArrayUnion),
"OverLay" => Some(Self::OverLay),
"Range" => Some(Self::Range),
+ "ArrayExcept" => Some(Self::ArrayExcept),
"ArrayPopFront" => Some(Self::ArrayPopFront),
"Levenshtein" => Some(Self::Levenshtein),
_ => None,
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs
b/datafusion/proto/src/logical_plan/from_proto.rs
index a34b1b7beb..f59a59f3c0 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -41,13 +41,13 @@ use datafusion_common::{
};
use datafusion_expr::{
abs, acos, acosh, array, array_append, array_concat, array_dims,
array_element,
- array_has, array_has_all, array_has_any, array_intersect, array_length,
array_ndims,
- array_position, array_positions, array_prepend, array_remove,
array_remove_all,
- array_remove_n, array_repeat, array_replace, array_replace_all,
array_replace_n,
- array_slice, array_to_string, arrow_typeof, ascii, asin, asinh, atan,
atan2, atanh,
- bit_length, btrim, cardinality, cbrt, ceil, character_length, chr,
coalesce,
- concat_expr, concat_ws_expr, cos, cosh, cot, current_date, current_time,
date_bin,
- date_part, date_trunc, decode, degrees, digest, encode, exp,
+ array_except, array_has, array_has_all, array_has_any, array_intersect,
array_length,
+ array_ndims, array_position, array_positions, array_prepend, array_remove,
+ array_remove_all, array_remove_n, array_repeat, array_replace,
array_replace_all,
+ array_replace_n, array_slice, array_to_string, arrow_typeof, ascii, asin,
asinh,
+ atan, atan2, atanh, bit_length, btrim, cardinality, cbrt, ceil,
character_length,
+ chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, current_date,
+ current_time, date_bin, date_part, date_trunc, decode, degrees, digest,
encode, exp,
expr::{self, InList, Sort, WindowFunction},
factorial, flatten, floor, from_unixtime, gcd, gen_range, isnan, iszero,
lcm, left,
levenshtein, ln, log, log10, log2,
@@ -465,6 +465,7 @@ impl From<&protobuf::ScalarFunction> for
BuiltinScalarFunction {
ScalarFunction::ArrayAppend => Self::ArrayAppend,
ScalarFunction::ArrayConcat => Self::ArrayConcat,
ScalarFunction::ArrayEmpty => Self::ArrayEmpty,
+ ScalarFunction::ArrayExcept => Self::ArrayExcept,
ScalarFunction::ArrayHasAll => Self::ArrayHasAll,
ScalarFunction::ArrayHasAny => Self::ArrayHasAny,
ScalarFunction::ArrayHas => Self::ArrayHas,
@@ -1352,6 +1353,10 @@ pub fn parse_expr(
.map(|expr| parse_expr(expr, registry))
.collect::<Result<Vec<_>, _>>()?,
)),
+ ScalarFunction::ArrayExcept => Ok(array_except(
+ parse_expr(&args[0], registry)?,
+ parse_expr(&args[1], registry)?,
+ )),
ScalarFunction::ArrayHasAll => Ok(array_has_all(
parse_expr(&args[0], registry)?,
parse_expr(&args[1], registry)?,
@@ -1364,6 +1369,10 @@ pub fn parse_expr(
parse_expr(&args[0], registry)?,
parse_expr(&args[1], registry)?,
)),
+ ScalarFunction::ArrayIntersect => Ok(array_intersect(
+ parse_expr(&args[0], registry)?,
+ parse_expr(&args[1], registry)?,
+ )),
ScalarFunction::ArrayPosition => Ok(array_position(
parse_expr(&args[0], registry)?,
parse_expr(&args[1], registry)?,
@@ -1415,10 +1424,6 @@ pub fn parse_expr(
parse_expr(&args[0], registry)?,
parse_expr(&args[1], registry)?,
)),
- ScalarFunction::ArrayIntersect => Ok(array_intersect(
- parse_expr(&args[0], registry)?,
- parse_expr(&args[1], registry)?,
- )),
ScalarFunction::Range => Ok(gen_range(
args.to_owned()
.iter()
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs
b/datafusion/proto/src/logical_plan/to_proto.rs
index 433c99403e..8bf4258236 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1476,6 +1476,7 @@ impl TryFrom<&BuiltinScalarFunction> for
protobuf::ScalarFunction {
BuiltinScalarFunction::ArrayAppend => Self::ArrayAppend,
BuiltinScalarFunction::ArrayConcat => Self::ArrayConcat,
BuiltinScalarFunction::ArrayEmpty => Self::ArrayEmpty,
+ BuiltinScalarFunction::ArrayExcept => Self::ArrayExcept,
BuiltinScalarFunction::ArrayHasAll => Self::ArrayHasAll,
BuiltinScalarFunction::ArrayHasAny => Self::ArrayHasAny,
BuiltinScalarFunction::ArrayHas => Self::ArrayHas,
diff --git a/datafusion/sqllogictest/test_files/array.slt
b/datafusion/sqllogictest/test_files/array.slt
index 99ed948836..61f190e7ba 100644
--- a/datafusion/sqllogictest/test_files/array.slt
+++ b/datafusion/sqllogictest/test_files/array.slt
@@ -2734,6 +2734,114 @@ select generate_series(5),
----
[0, 1, 2, 3, 4] [2, 3, 4] [2, 5, 8]
+## array_except
+
+statement ok
+CREATE TABLE array_except_table
+AS VALUES
+ ([1, 2, 2, 3], [2, 3, 4]),
+ ([2, 3, 3], [3]),
+ ([3], [3, 3, 4]),
+ (null, [3, 4]),
+ ([1, 2], null),
+ (null, null)
+;
+
+query ?
+select array_except(column1, column2) from array_except_table;
+----
+[1]
+[2]
+[]
+NULL
+[1, 2]
+NULL
+
+statement ok
+drop table array_except_table;
+
+statement ok
+CREATE TABLE array_except_nested_list_table
+AS VALUES
+ ([[1, 2], [3]], [[2], [3], [4, 5]]),
+ ([[1, 2], [3]], [[2], [1, 2]]),
+ ([[1, 2], [3]], null),
+ (null, [[1], [2, 3], [4, 5, 6]]),
+ ([[1], [2, 3], [4, 5, 6]], [[2, 3], [4, 5, 6], [1]])
+;
+
+query ?
+select array_except(column1, column2) from array_except_nested_list_table;
+----
+[[1, 2]]
+[[3]]
+[[1, 2], [3]]
+NULL
+[]
+
+statement ok
+drop table array_except_nested_list_table;
+
+statement ok
+CREATE TABLE array_except_table_float
+AS VALUES
+ ([1.1, 2.2, 3.3], [2.2]),
+ ([1.1, 2.2, 3.3], [4.4]),
+ ([1.1, 2.2, 3.3], [3.3, 2.2, 1.1])
+;
+
+query ?
+select array_except(column1, column2) from array_except_table_float;
+----
+[1.1, 3.3]
+[1.1, 2.2, 3.3]
+[]
+
+statement ok
+drop table array_except_table_float;
+
+statement ok
+CREATE TABLE array_except_table_ut8
+AS VALUES
+ (['a', 'b', 'c'], ['a']),
+ (['a', 'bc', 'def'], ['g', 'def']),
+ (['a', 'bc', 'def'], null),
+ (null, ['a'])
+;
+
+query ?
+select array_except(column1, column2) from array_except_table_ut8;
+----
+[b, c]
+[a, bc]
+[a, bc, def]
+NULL
+
+statement ok
+drop table array_except_table_ut8;
+
+statement ok
+CREATE TABLE array_except_table_bool
+AS VALUES
+ ([true, false, false], [false]),
+ ([true, true, true], [false]),
+ ([false, false, false], [true]),
+ ([true, false], null),
+ (null, [true, false])
+;
+
+query ?
+select array_except(column1, column2) from array_except_table_bool;
+----
+[true]
+[true]
+[false]
+[true, false]
+NULL
+
+statement ok
+drop table array_except_table_bool;
+
### Array operators tests
diff --git a/docs/source/user-guide/expressions.md
b/docs/source/user-guide/expressions.md
index 191ef6cd91..257c50dfa4 100644
--- a/docs/source/user-guide/expressions.md
+++ b/docs/source/user-guide/expressions.md
@@ -235,6 +235,7 @@ Unlike to some databases the math functions in Datafusion
works the same way as
| array_to_string(array, delimiter) | Converts each element to its text
representation. `array_to_string([1, 2, 3, 4], ',') -> 1,2,3,4`
|
| array_intersect(array1, array2) | Returns an array of the elements in
the intersection of array1 and array2. `array_intersect([1, 2, 3, 4], [5, 6, 3,
4]) -> [3, 4]` |
| array_union(array1, array2) | Returns an array of the elements in
the union of array1 and array2 without duplicates. `array_union([1, 2, 3, 4],
[5, 6, 3, 4]) -> [1, 2, 3, 4, 5, 6]` |
+| array_except(array1, array2) | Returns an array of the elements
that appear in the first array but not in the second. `array_except([1, 2, 3,
4], [5, 6, 3, 4]) -> [3, 4]` |
| cardinality(array) | Returns the total number of elements
in the array. `cardinality([[1, 2, 3], [4, 5, 6]]) -> 6`
|
| make_array(value1, [value2 [, ...]]) | Returns an Arrow array using the
specified input expressions. `make_array(1, 2, 3) -> [1, 2, 3]`
|
| range(start [, stop, step]) | Returns an Arrow array between start
and stop with step. `SELECT range(2, 10, 3) -> [2, 5, 8]`
|
diff --git a/docs/source/user-guide/sql/scalar_functions.md
b/docs/source/user-guide/sql/scalar_functions.md
index f9f45a1b0a..eda46ef8a7 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -2307,6 +2307,44 @@ array_union(array1, array2)
- list_union
+### `array_except`
+
+Returns an array of the elements that appear in the first array but not in the
second.
+
+```
+array_except(array1, array2)
+```
+
+#### Arguments
+
+- **array1**: Array expression.
+ Can be a constant, column, or function, and any combination of array
operators.
+- **array2**: Array expression.
+ Can be a constant, column, or function, and any combination of array
operators.
+
+#### Example
+
+```
+❯ select array_except([1, 2, 3, 4], [5, 6, 3, 4]);
++----------------------------------------------------+
+| array_except([1, 2, 3, 4], [5, 6, 3, 4]); |
++----------------------------------------------------+
+| [1, 2] |
++----------------------------------------------------+
+❯ select array_except([1, 2, 3, 4], [3, 4, 5, 6]);
++----------------------------------------------------+
+| array_except([1, 2, 3, 4], [3, 4, 5, 6]); |
++----------------------------------------------------+
+| [3, 4] |
++----------------------------------------------------+
+```
+
+---
+
+#### Aliases
+
+- list_except
+
### `cardinality`
Returns the total number of elements in the array.