This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 8290a4f3f feat: cast List / LargeList to Utf8 / LargeUtf8 (#2588)
8290a4f3f is described below
commit 8290a4f3fb90f6715ba977e71618df73f6c66d20
Author: George Andronchik <[email protected]>
AuthorDate: Tue Oct 4 17:04:27 2022 +0800
feat: cast List / LargeList to Utf8 / LargeUtf8 (#2588)
---
arrow/src/compute/kernels/cast.rs | 89 ++++++++++++++++++++++++++++++++++++++-
arrow/src/util/display.rs | 17 ++++++++
2 files changed, 105 insertions(+), 1 deletion(-)
diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs
index eab3dafda..31ac738fa 100644
--- a/arrow/src/compute/kernels/cast.rs
+++ b/arrow/src/compute/kernels/cast.rs
@@ -57,7 +57,10 @@ use crate::temporal_conversions::{
NANOSECONDS, SECONDS_IN_DAY,
};
use crate::{array::*, compute::take};
-use crate::{buffer::Buffer, util::serialization::lexical_to_string};
+use crate::{
+ buffer::Buffer, util::display::array_value_to_string,
+ util::serialization::lexical_to_string,
+};
use num::cast::AsPrimitive;
use num::{BigInt, NumCast, ToPrimitive};
@@ -136,6 +139,10 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
(List(list_from), LargeList(list_to)) => {
list_from.data_type() == list_to.data_type()
}
+ (LargeList(list_from), List(list_to)) => {
+ list_from.data_type() == list_to.data_type()
+ }
+ (List(list_from) | LargeList(list_from), Utf8 | LargeUtf8) => can_cast_types(list_from.data_type(), to_type),
(List(_), _) => false,
(_, List(list_to)) => can_cast_types(from_type, list_to.data_type()),
(_, LargeList(list_to)) => can_cast_types(from_type, list_to.data_type()),
@@ -408,6 +415,21 @@ macro_rules! cast_decimal_to_float {
}};
}
+// cast the List array to Utf8 array
+macro_rules! cast_list_to_string {
+ ($ARRAY:expr, $SIZE:ident) => {{
+ let mut value_builder: GenericStringBuilder<$SIZE> = GenericStringBuilder::new();
+ for i in 0..$ARRAY.len() {
+ if $ARRAY.is_null(i) {
+ value_builder.append_null();
+ } else {
+ value_builder.append_value(array_value_to_string($ARRAY, i)?);
+ }
+ }
+ Ok(Arc::new(value_builder.finish()))
+ }};
+}
+
/// Cast `array` to the provided data type and return a new Array with
/// type `to_type`, if possible. It accepts `CastOptions` to allow consumers
/// to configure cast behavior.
@@ -585,6 +607,8 @@ pub fn cast_with_options(
cast_list_container::<i64, i32>(&**array, cast_options)
}
}
+ (List(_) | LargeList(_), Utf8) => cast_list_to_string!(array, i32),
+ (List(_) | LargeList(_), LargeUtf8) => cast_list_to_string!(array, i64),
(List(_), _) => Err(ArrowError::CastError(
"Cannot cast list to non-list data types".to_string(),
)),
@@ -5764,4 +5788,67 @@ mod tests {
&expected
);
}
+
+ #[test]
+ fn test_list_to_string() {
+ let str_array = StringArray::from(vec!["a", "b", "c", "d", "e", "f", "g", "h"]);
+ let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]);
+ let value_data = ArrayData::builder(DataType::Utf8)
+ .len(str_array.len())
+ .buffers(str_array.data().buffers().to_vec())
+ .build()
+ .unwrap();
+
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Utf8, true)));
+ let list_data = ArrayData::builder(list_data_type)
+ .len(3)
+ .add_buffer(value_offsets)
+ .add_child_data(value_data)
+ .build()
+ .unwrap();
+ let array = Arc::new(ListArray::from(list_data)) as ArrayRef;
+
+ let out = cast(&array, &DataType::Utf8).unwrap();
+ let out = out
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap()
+ .into_iter()
+ .flatten()
+ .collect::<Vec<_>>();
+ assert_eq!(&out, &vec!["[a, b, c]", "[d, e, f]", "[g, h]"]);
+
+ let out = cast(&array, &DataType::LargeUtf8).unwrap();
+ let out = out
+ .as_any()
+ .downcast_ref::<LargeStringArray>()
+ .unwrap()
+ .into_iter()
+ .flatten()
+ .collect::<Vec<_>>();
+ assert_eq!(&out, &vec!["[a, b, c]", "[d, e, f]", "[g, h]"]);
+
+ let array = Arc::new(make_list_array()) as ArrayRef;
+ let out = cast(&array, &DataType::Utf8).unwrap();
+ let out = out
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap()
+ .into_iter()
+ .flatten()
+ .collect::<Vec<_>>();
+ assert_eq!(&out, &vec!["[0, 1, 2]", "[3, 4, 5]", "[6, 7]"]);
+
+ let array = Arc::new(make_large_list_array()) as ArrayRef;
+ let out = cast(&array, &DataType::LargeUtf8).unwrap();
+ let out = out
+ .as_any()
+ .downcast_ref::<LargeStringArray>()
+ .unwrap()
+ .into_iter()
+ .flatten()
+ .collect::<Vec<_>>();
+ assert_eq!(&out, &vec!["[0, 1, 2]", "[3, 4, 5]", "[6, 7]"]);
+ }
}
diff --git a/arrow/src/util/display.rs b/arrow/src/util/display.rs
index aa4fd4200..cf8394efa 100644
--- a/arrow/src/util/display.rs
+++ b/arrow/src/util/display.rs
@@ -235,6 +235,22 @@ macro_rules! make_string_from_list {
}};
}
+macro_rules! make_string_from_large_list {
+ ($column: ident, $row: ident) => {{
+ let list = $column
+ .as_any()
+ .downcast_ref::<array::LargeListArray>()
+ .ok_or(ArrowError::InvalidArgumentError(format!(
+ "Repl error: could not convert large list column to list array."
+ )))?
+ .value($row);
+ let string_values = (0..list.len())
+ .map(|i| array_value_to_string(&list, i))
+ .collect::<Result<Vec<String>>>()?;
+ Ok(format!("[{}]", string_values.join(", ")))
+ }};
+}
+
macro_rules! make_string_from_fixed_size_list {
($column: ident, $row: ident) => {{
let list = $column
@@ -357,6 +373,7 @@ pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result<Str
}
},
DataType::List(_) => make_string_from_list!(column, row),
+ DataType::LargeList(_) => make_string_from_large_list!(column, row),
DataType::Dictionary(index_type, _value_type) => match **index_type {
DataType::Int8 => dict_array_value_to_string::<Int8Type>(column, row),
DataType::Int16 => dict_array_value_to_string::<Int16Type>(column, row),