This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 350ea26c6 Support `Utf8View` for `bit_length` kernel (#6671)
350ea26c6 is described below
commit 350ea26c6cc646baefb72e39e138d10f9261f71e
Author: Austin Liu <[email protected]>
AuthorDate: Wed Nov 6 00:13:19 2024 +0800
Support `Utf8View` for `bit_length` kernel (#6671)
* Support `Utf8View` for string function `bit_length()`
Signed-off-by: Austin Liu <[email protected]>
* Add test & handle view bytes length counting
Signed-off-by: Austin Liu <[email protected]>
Add test & handle view bytes length counting
Signed-off-by: Austin Liu <[email protected]>
* Refine `string_view_array`
Signed-off-by: Austin Liu <[email protected]>
* Make length from `i32` to `u32` & check nullity
Signed-off-by: Austin Liu <[email protected]>
* Clean up
Signed-off-by: Austin Liu <[email protected]>
* Refine
Signed-off-by: Austin Liu <[email protected]>
* Use `from_unary` instead
Signed-off-by: Austin Liu <[email protected]>
* Avoid inspecting the string data
Signed-off-by: Austin Liu <[email protected]>
* Clean up
Signed-off-by: Austin Liu <[email protected]>
---------
Signed-off-by: Austin Liu <[email protected]>
---
arrow-string/src/length.rs | 38 ++++++++++++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)
diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs
index 97f876a9f..6a28d44ea 100644
--- a/arrow-string/src/length.rs
+++ b/arrow-string/src/length.rs
@@ -137,6 +137,15 @@ pub fn bit_length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
let list = array.as_string::<i64>();
Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
}
+ DataType::Utf8View => {
+ let list = array.as_string_view();
+ let values = list
+ .views()
+ .iter()
+ .map(|view| (*view as i32).wrapping_mul(8))
+ .collect();
+ Ok(Arc::new(Int32Array::new(values, array.nulls().cloned())))
+ }
DataType::Binary => {
let list = array.as_binary::<i32>();
Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
@@ -462,6 +471,35 @@ mod tests {
})
}
+ /// Runs `bit_length` over a `StringViewArray` built from each case returned by
+ /// `bit_length_cases()` and checks both the result length and every value.
+ #[test]
+ fn bit_length_test_utf8view() {
+     for (input, len, expected) in bit_length_cases() {
+         let view_array = StringViewArray::from(input);
+         let bits = bit_length(&view_array).unwrap();
+         assert_eq!(len, bits.len());
+         // The kernel hands back an `ArrayRef`; downcast to read the `i32` values.
+         let bits = bits.as_any().downcast_ref::<Int32Array>().unwrap();
+         for (idx, want) in expected.iter().enumerate() {
+             assert_eq!(*want, bits.value(idx));
+         }
+     }
+ }
+
+ /// Runs `bit_length` over a `StringViewArray` built from each case returned by
+ /// `bit_length_null_cases()` and compares the whole result, nulls included,
+ /// against the expected `Int32Array`.
+ #[test]
+ fn bit_length_null_utf8view() {
+     bit_length_null_cases()
+         .into_iter()
+         .for_each(|(input, len, expected)| {
+             // Build a view array (not a `StringArray`) so the
+             // `DataType::Utf8View` arm of `bit_length` is the one under test.
+             let array = StringViewArray::from(input);
+             let result = bit_length(&array).unwrap();
+             assert_eq!(len, result.len());
+             let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
+
+             // Whole-array equality also checks the null positions.
+             let expected: Int32Array = expected.into();
+             assert_eq!(&expected, result);
+         })
+ }
#[test]
fn bit_length_binary() {
let value: Vec<&[u8]> = vec![b"one", &[0xff, 0xf8], b"three"];