This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new d9d8ddd5f7 feat: Support `array_sort`(`list_sort`) (#8279)
d9d8ddd5f7 is described below
commit d9d8ddd5f770817f325190c4c0cc02436e7777e6
Author: Asura7969 <[email protected]>
AuthorDate: Thu Dec 7 06:00:16 2023 +0800
feat: Support `array_sort`(`list_sort`) (#8279)
* Minor: Improve the document format of JoinHashMap
* list sort
* fix: example doc
* fix: ci
* fix: doc error
* fix pb
* like DuckDB function semantics
* fix ci
* fix pb
* fix: doc
* add table test
* fix: not as expected
* fix: return null
* resolve conflicts
* doc
* merge
---
datafusion/expr/src/built_in_function.rs | 8 +++
datafusion/expr/src/expr_fn.rs | 3 +
datafusion/physical-expr/src/array_expressions.rs | 83 ++++++++++++++++++++++-
datafusion/physical-expr/src/functions.rs | 3 +
datafusion/proto/proto/datafusion.proto | 1 +
datafusion/proto/src/generated/pbjson.rs | 3 +
datafusion/proto/src/generated/prost.rs | 3 +
datafusion/proto/src/logical_plan/from_proto.rs | 15 ++--
datafusion/proto/src/logical_plan/to_proto.rs | 1 +
datafusion/sqllogictest/test_files/array.slt | 50 ++++++++++++--
docs/source/user-guide/sql/scalar_functions.md | 36 ++++++++++
11 files changed, 194 insertions(+), 12 deletions(-)
diff --git a/datafusion/expr/src/built_in_function.rs
b/datafusion/expr/src/built_in_function.rs
index d48e9e7a67..44fbf45525 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -130,6 +130,8 @@ pub enum BuiltinScalarFunction {
// array functions
/// array_append
ArrayAppend,
+ /// array_sort
+ ArraySort,
/// array_concat
ArrayConcat,
/// array_has
@@ -398,6 +400,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Tanh => Volatility::Immutable,
BuiltinScalarFunction::Trunc => Volatility::Immutable,
BuiltinScalarFunction::ArrayAppend => Volatility::Immutable,
+ BuiltinScalarFunction::ArraySort => Volatility::Immutable,
BuiltinScalarFunction::ArrayConcat => Volatility::Immutable,
BuiltinScalarFunction::ArrayEmpty => Volatility::Immutable,
BuiltinScalarFunction::ArrayHasAll => Volatility::Immutable,
@@ -545,6 +548,7 @@ impl BuiltinScalarFunction {
Ok(data_type)
}
BuiltinScalarFunction::ArrayAppend =>
Ok(input_expr_types[0].clone()),
+ BuiltinScalarFunction::ArraySort =>
Ok(input_expr_types[0].clone()),
BuiltinScalarFunction::ArrayConcat => {
let mut expr_type = Null;
let mut max_dims = 0;
@@ -909,6 +913,9 @@ impl BuiltinScalarFunction {
// for now, the list is small, as we do not have many built-in
functions.
match self {
BuiltinScalarFunction::ArrayAppend => Signature::any(2,
self.volatility()),
+ BuiltinScalarFunction::ArraySort => {
+ Signature::variadic_any(self.volatility())
+ }
BuiltinScalarFunction::ArrayPopFront => Signature::any(1,
self.volatility()),
BuiltinScalarFunction::ArrayPopBack => Signature::any(1,
self.volatility()),
BuiltinScalarFunction::ArrayConcat => {
@@ -1558,6 +1565,7 @@ impl BuiltinScalarFunction {
"array_push_back",
"list_push_back",
],
+ BuiltinScalarFunction::ArraySort => &["array_sort", "list_sort"],
BuiltinScalarFunction::ArrayConcat => {
&["array_concat", "array_cat", "list_concat", "list_cat"]
}
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index 6148226f6b..8d25619c07 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -583,6 +583,8 @@ scalar_expr!(
"appends an element to the end of an array."
);
+scalar_expr!(ArraySort, array_sort, array desc null_first, "returns sorted
array.");
+
scalar_expr!(
ArrayPopBack,
array_pop_back,
@@ -1184,6 +1186,7 @@ mod test {
test_scalar_expr!(FromUnixtime, from_unixtime, unixtime);
test_scalar_expr!(ArrayAppend, array_append, array, element);
+ test_scalar_expr!(ArraySort, array_sort, array, desc, null_first);
test_scalar_expr!(ArrayPopFront, array_pop_front, array);
test_scalar_expr!(ArrayPopBack, array_pop_back, array);
test_unary_scalar_expr!(ArrayDims, array_dims);
diff --git a/datafusion/physical-expr/src/array_expressions.rs
b/datafusion/physical-expr/src/array_expressions.rs
index f254274edd..269bbf7dcf 100644
--- a/datafusion/physical-expr/src/array_expressions.rs
+++ b/datafusion/physical-expr/src/array_expressions.rs
@@ -29,7 +29,7 @@ use arrow::datatypes::{DataType, Field, UInt64Type};
use arrow::row::{RowConverter, SortField};
use arrow_buffer::NullBuffer;
-use arrow_schema::FieldRef;
+use arrow_schema::{FieldRef, SortOptions};
use datafusion_common::cast::{
as_generic_list_array, as_generic_string_array, as_int64_array,
as_list_array,
as_null_array, as_string_array,
@@ -693,7 +693,7 @@ fn general_append_and_prepend(
/// # Arguments
///
/// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and
step(step value can not be zero.) values.
-///
+///
/// # Examples
///
/// gen_range(3) => [0, 1, 2]
@@ -777,6 +777,85 @@ pub fn array_append(args: &[ArrayRef]) -> Result<ArrayRef>
{
Ok(res)
}
+/// Array_sort SQL function
+pub fn array_sort(args: &[ArrayRef]) -> Result<ArrayRef> {
+ let sort_option = match args.len() {
+ 1 => None,
+ 2 => {
+ let sort = as_string_array(&args[1])?.value(0);
+ Some(SortOptions {
+ descending: order_desc(sort)?,
+ nulls_first: true,
+ })
+ }
+ 3 => {
+ let sort = as_string_array(&args[1])?.value(0);
+ let nulls_first = as_string_array(&args[2])?.value(0);
+ Some(SortOptions {
+ descending: order_desc(sort)?,
+ nulls_first: order_nulls_first(nulls_first)?,
+ })
+ }
+ _ => return internal_err!("array_sort expects 1 to 3 arguments"),
+ };
+
+ let list_array = as_list_array(&args[0])?;
+ let row_count = list_array.len();
+
+ let mut array_lengths = vec![];
+ let mut arrays = vec![];
+ let mut valid = BooleanBufferBuilder::new(row_count);
+ for i in 0..row_count {
+ if list_array.is_null(i) {
+ array_lengths.push(0);
+ valid.append(false);
+ } else {
+ let arr_ref = list_array.value(i);
+ let arr_ref = arr_ref.as_ref();
+
+ let sorted_array = compute::sort(arr_ref, sort_option)?;
+ array_lengths.push(sorted_array.len());
+ arrays.push(sorted_array);
+ valid.append(true);
+ }
+ }
+
+ // Assume all arrays have the same data type
+ let data_type = list_array.value_type();
+ let buffer = valid.finish();
+
+ let elements = arrays
+ .iter()
+ .map(|a| a.as_ref())
+ .collect::<Vec<&dyn Array>>();
+
+ let list_arr = ListArray::new(
+ Arc::new(Field::new("item", data_type, true)),
+ OffsetBuffer::from_lengths(array_lengths),
+ Arc::new(compute::concat(elements.as_slice())?),
+ Some(NullBuffer::new(buffer)),
+ );
+ Ok(Arc::new(list_arr))
+}
+
+fn order_desc(modifier: &str) -> Result<bool> {
+ match modifier.to_uppercase().as_str() {
+ "DESC" => Ok(true),
+ "ASC" => Ok(false),
+ _ => internal_err!("the second parameter of array_sort expects DESC or
ASC"),
+ }
+}
+
+fn order_nulls_first(modifier: &str) -> Result<bool> {
+ match modifier.to_uppercase().as_str() {
+ "NULLS FIRST" => Ok(true),
+ "NULLS LAST" => Ok(false),
+ _ => internal_err!(
+ "the third parameter of array_sort expects NULLS FIRST or NULLS
LAST"
+ ),
+ }
+}
+
/// Array_prepend SQL function
pub fn array_prepend(args: &[ArrayRef]) -> Result<ArrayRef> {
let list_array = as_list_array(&args[1])?;
diff --git a/datafusion/physical-expr/src/functions.rs
b/datafusion/physical-expr/src/functions.rs
index b5d7a8e97d..873864a57a 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -329,6 +329,9 @@ pub fn create_physical_fun(
BuiltinScalarFunction::ArrayAppend => {
Arc::new(|args|
make_scalar_function(array_expressions::array_append)(args))
}
+ BuiltinScalarFunction::ArraySort => {
+ Arc::new(|args|
make_scalar_function(array_expressions::array_sort)(args))
+ }
BuiltinScalarFunction::ArrayConcat => {
Arc::new(|args|
make_scalar_function(array_expressions::array_concat)(args))
}
diff --git a/datafusion/proto/proto/datafusion.proto
b/datafusion/proto/proto/datafusion.proto
index 64b8e28074..863e3c315c 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -644,6 +644,7 @@ enum ScalarFunction {
Levenshtein = 125;
SubstrIndex = 126;
FindInSet = 127;
+ ArraySort = 128;
}
message ScalarFunctionNode {
diff --git a/datafusion/proto/src/generated/pbjson.rs
b/datafusion/proto/src/generated/pbjson.rs
index 34ad63d819..74798ee8e9 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -20905,6 +20905,7 @@ impl serde::Serialize for ScalarFunction {
Self::Levenshtein => "Levenshtein",
Self::SubstrIndex => "SubstrIndex",
Self::FindInSet => "FindInSet",
+ Self::ArraySort => "ArraySort",
};
serializer.serialize_str(variant)
}
@@ -21044,6 +21045,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"Levenshtein",
"SubstrIndex",
"FindInSet",
+ "ArraySort",
];
struct GeneratedVisitor;
@@ -21212,6 +21214,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"Levenshtein" => Ok(ScalarFunction::Levenshtein),
"SubstrIndex" => Ok(ScalarFunction::SubstrIndex),
"FindInSet" => Ok(ScalarFunction::FindInSet),
+ "ArraySort" => Ok(ScalarFunction::ArraySort),
_ => Err(serde::de::Error::unknown_variant(value, FIELDS)),
}
}
diff --git a/datafusion/proto/src/generated/prost.rs
b/datafusion/proto/src/generated/prost.rs
index 8b4dd1b759..ae20913e3d 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2601,6 +2601,7 @@ pub enum ScalarFunction {
Levenshtein = 125,
SubstrIndex = 126,
FindInSet = 127,
+ ArraySort = 128,
}
impl ScalarFunction {
/// String value of the enum field names used in the ProtoBuf definition.
@@ -2737,6 +2738,7 @@ impl ScalarFunction {
ScalarFunction::Levenshtein => "Levenshtein",
ScalarFunction::SubstrIndex => "SubstrIndex",
ScalarFunction::FindInSet => "FindInSet",
+ ScalarFunction::ArraySort => "ArraySort",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
@@ -2870,6 +2872,7 @@ impl ScalarFunction {
"Levenshtein" => Some(Self::Levenshtein),
"SubstrIndex" => Some(Self::SubstrIndex),
"FindInSet" => Some(Self::FindInSet),
+ "ArraySort" => Some(Self::ArraySort),
_ => None,
}
}
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs
b/datafusion/proto/src/logical_plan/from_proto.rs
index ae3628bdde..13576aaa08 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -44,10 +44,11 @@ use datafusion_expr::{
array_except, array_has, array_has_all, array_has_any, array_intersect,
array_length,
array_ndims, array_position, array_positions, array_prepend, array_remove,
array_remove_all, array_remove_n, array_repeat, array_replace,
array_replace_all,
- array_replace_n, array_slice, array_to_string, arrow_typeof, ascii, asin,
asinh,
- atan, atan2, atanh, bit_length, btrim, cardinality, cbrt, ceil,
character_length,
- chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, current_date,
- current_time, date_bin, date_part, date_trunc, decode, degrees, digest,
encode, exp,
+ array_replace_n, array_slice, array_sort, array_to_string, arrow_typeof,
ascii, asin,
+ asinh, atan, atan2, atanh, bit_length, btrim, cardinality, cbrt, ceil,
+ character_length, chr, coalesce, concat_expr, concat_ws_expr, cos, cosh,
cot,
+ current_date, current_time, date_bin, date_part, date_trunc, decode,
degrees, digest,
+ encode, exp,
expr::{self, InList, Sort, WindowFunction},
factorial, find_in_set, flatten, floor, from_unixtime, gcd, gen_range,
isnan, iszero,
lcm, left, levenshtein, ln, log, log10, log2,
@@ -463,6 +464,7 @@ impl From<&protobuf::ScalarFunction> for
BuiltinScalarFunction {
ScalarFunction::Rtrim => Self::Rtrim,
ScalarFunction::ToTimestamp => Self::ToTimestamp,
ScalarFunction::ArrayAppend => Self::ArrayAppend,
+ ScalarFunction::ArraySort => Self::ArraySort,
ScalarFunction::ArrayConcat => Self::ArrayConcat,
ScalarFunction::ArrayEmpty => Self::ArrayEmpty,
ScalarFunction::ArrayExcept => Self::ArrayExcept,
@@ -1343,6 +1345,11 @@ pub fn parse_expr(
parse_expr(&args[0], registry)?,
parse_expr(&args[1], registry)?,
)),
+ ScalarFunction::ArraySort => Ok(array_sort(
+ parse_expr(&args[0], registry)?,
+ parse_expr(&args[1], registry)?,
+ parse_expr(&args[2], registry)?,
+ )),
ScalarFunction::ArrayPopFront => {
Ok(array_pop_front(parse_expr(&args[0], registry)?))
}
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs
b/datafusion/proto/src/logical_plan/to_proto.rs
index ecbfaca5db..0af8d9f3e7 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1502,6 +1502,7 @@ impl TryFrom<&BuiltinScalarFunction> for
protobuf::ScalarFunction {
BuiltinScalarFunction::Rtrim => Self::Rtrim,
BuiltinScalarFunction::ToTimestamp => Self::ToTimestamp,
BuiltinScalarFunction::ArrayAppend => Self::ArrayAppend,
+ BuiltinScalarFunction::ArraySort => Self::ArraySort,
BuiltinScalarFunction::ArrayConcat => Self::ArrayConcat,
BuiltinScalarFunction::ArrayEmpty => Self::ArrayEmpty,
BuiltinScalarFunction::ArrayExcept => Self::ArrayExcept,
diff --git a/datafusion/sqllogictest/test_files/array.slt
b/datafusion/sqllogictest/test_files/array.slt
index d8bf441d71..3c23dd369a 100644
--- a/datafusion/sqllogictest/test_files/array.slt
+++ b/datafusion/sqllogictest/test_files/array.slt
@@ -1052,6 +1052,44 @@ select make_array(['a','b'], null);
----
[[a, b], ]
+## array_sort (aliases: `list_sort`)
+query ???
+select array_sort(make_array(1, 3, null, 5, NULL, -5)),
array_sort(make_array(1, 3, null, 2), 'ASC'), array_sort(make_array(1, 3, null,
2), 'desc', 'NULLS FIRST');
+----
+[, , -5, 1, 3, 5] [, 1, 2, 3] [, 3, 2, 1]
+
+query ?
+select array_sort(column1, 'DESC', 'NULLS LAST') from arrays_values;
+----
+[10, 9, 8, 7, 6, 5, 4, 3, 2, ]
+[20, 18, 17, 16, 15, 14, 13, 12, 11, ]
+[30, 29, 28, 27, 26, 25, 23, 22, 21, ]
+[40, 39, 38, 37, 35, 34, 33, 32, 31, ]
+NULL
+[50, 49, 48, 47, 46, 45, 44, 43, 42, 41]
+[60, 59, 58, 57, 56, 55, 54, 52, 51, ]
+[70, 69, 68, 67, 66, 65, 64, 63, 62, 61]
+
+query ?
+select array_sort(column1, 'ASC', 'NULLS FIRST') from arrays_values;
+----
+[, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+[, 11, 12, 13, 14, 15, 16, 17, 18, 20]
+[, 21, 22, 23, 25, 26, 27, 28, 29, 30]
+[, 31, 32, 33, 34, 35, 37, 38, 39, 40]
+NULL
+[41, 42, 43, 44, 45, 46, 47, 48, 49, 50]
+[, 51, 52, 54, 55, 56, 57, 58, 59, 60]
+[61, 62, 63, 64, 65, 66, 67, 68, 69, 70]
+
+
+## list_sort (aliases: `array_sort`)
+query ???
+select list_sort(make_array(1, 3, null, 5, NULL, -5)), list_sort(make_array(1,
3, null, 2), 'ASC'), list_sort(make_array(1, 3, null, 2), 'desc', 'NULLS
FIRST');
+----
+[, , -5, 1, 3, 5] [, 1, 2, 3] [, 3, 2, 1]
+
+
## array_append (aliases: `list_append`, `array_push_back`, `list_push_back`)
# TODO: array_append with NULLs
@@ -1224,7 +1262,7 @@ select array_prepend(make_array(1, 11, 111), column1),
array_prepend(column2, ma
# array_repeat scalar function #1
query ????????
-select
+select
array_repeat(1, 5),
array_repeat(3.14, 3),
array_repeat('l', 4),
@@ -1257,7 +1295,7 @@ AS VALUES
(0, 3, 3.3, 'datafusion', make_array(8, 9));
query ??????
-select
+select
array_repeat(column2, column1),
array_repeat(column3, column1),
array_repeat(column4, column1),
@@ -1272,7 +1310,7 @@ from array_repeat_table;
[] [] [] [] [3, 3, 3] []
statement ok
-drop table array_repeat_table;
+drop table array_repeat_table;
## array_concat (aliases: `array_cat`, `list_concat`, `list_cat`)
@@ -2188,7 +2226,7 @@ select array_remove(make_array(1, 2, 2, 1, 1), 2),
array_remove(make_array(1.0,
[1, 2, 1, 1] [2.0, 2.0, 1.0, 1.0] [h, e, l, o]
query ???
-select
+select
array_remove(make_array(1, null, 2, 3), 2),
array_remove(make_array(1.1, null, 2.2, 3.3), 1.1),
array_remove(make_array('a', null, 'bc'), 'a');
@@ -2887,7 +2925,7 @@ from array_intersect_table_3D;
query ??????
SELECT array_intersect(make_array(1,2,3), make_array(2,3,4)),
array_intersect(make_array(1,3,5), make_array(2,4,6)),
- array_intersect(make_array('aa','bb','cc'),
make_array('cc','aa','dd')),
+ array_intersect(make_array('aa','bb','cc'),
make_array('cc','aa','dd')),
array_intersect(make_array(true, false), make_array(true)),
array_intersect(make_array(1.1, 2.2, 3.3), make_array(2.2, 3.3, 4.4)),
array_intersect(make_array([1, 1], [2, 2], [3, 3]), make_array([2, 2],
[3, 3], [4, 4]))
@@ -2918,7 +2956,7 @@ NULL
query ??????
SELECT list_intersect(make_array(1,2,3), make_array(2,3,4)),
list_intersect(make_array(1,3,5), make_array(2,4,6)),
- list_intersect(make_array('aa','bb','cc'),
make_array('cc','aa','dd')),
+ list_intersect(make_array('aa','bb','cc'), make_array('cc','aa','dd')),
list_intersect(make_array(true, false), make_array(true)),
list_intersect(make_array(1.1, 2.2, 3.3), make_array(2.2, 3.3, 4.4)),
list_intersect(make_array([1, 1], [2, 2], [3, 3]), make_array([2, 2],
[3, 3], [4, 4]))
diff --git a/docs/source/user-guide/sql/scalar_functions.md
b/docs/source/user-guide/sql/scalar_functions.md
index 46920f1c4d..9a9bec9df7 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -1555,6 +1555,7 @@ from_unixtime(expression)
## Array Functions
- [array_append](#array_append)
+- [array_sort](#array_sort)
- [array_cat](#array_cat)
- [array_concat](#array_concat)
- [array_contains](#array_contains)
@@ -1584,6 +1585,7 @@ from_unixtime(expression)
- [cardinality](#cardinality)
- [empty](#empty)
- [list_append](#list_append)
+- [list_sort](#list_sort)
- [list_cat](#list_cat)
- [list_concat](#list_concat)
- [list_dims](#list_dims)
@@ -1645,6 +1647,36 @@ array_append(array, element)
- list_append
- list_push_back
+### `array_sort`
+
+Sort array.
+
+```
+array_sort(array, desc, nulls_first)
+```
+
+#### Arguments
+
+- **array**: Array expression.
+ Can be a constant, column, or function, and any combination of array
operators.
+- **desc**: Whether to sort in descending order(`ASC` or `DESC`).
+- **nulls_first**: Whether to sort nulls first(`NULLS FIRST` or `NULLS LAST`).
+
+#### Example
+
+```
+❯ select array_sort([3, 1, 2]);
++-----------------------------+
+| array_sort(List([3,1,2])) |
++-----------------------------+
+| [1, 2, 3] |
++-----------------------------+
+```
+
+#### Aliases
+
+- list_sort
+
### `array_cat`
_Alias of [array_concat](#array_concat)._
@@ -2433,6 +2465,10 @@ empty(array)
_Alias of [array_append](#array_append)._
+### `list_sort`
+
+_Alias of [array_sort](#array_sort)._
+
### `list_cat`
_Alias of [array_concat](#array_concat)._