alamb commented on code in PR #4718:
URL: https://github.com/apache/arrow-rs/pull/4718#discussion_r1303399321
##########
arrow-string/src/length.rs:
##########
@@ -194,28 +98,42 @@ pub fn length(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
/// * bit_length of null is null.
/// * bit_length is in number of bits
pub fn bit_length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
+ if let Some(d) = array.as_any_dictionary_opt() {
+ let lengths = bit_length(d.values().as_ref())?;
+ return Ok(d.with_values(lengths));
+ }
+
match array.data_type() {
- DataType::Dictionary(kt, _) => {
- kernel_dict!(
- array,
- |a| { bit_length(a) },
- kt,
- Int8: Int8Type,
- Int16: Int16Type,
- Int32: Int32Type,
- Int64: Int64Type,
- UInt8: UInt8Type,
- UInt16: UInt16Type,
- UInt32: UInt32Type,
- UInt64: UInt64Type
- )
+ DataType::List(_) => {
+ let list = array.as_list::<i32>();
+ Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
}
- DataType::Utf8 => Ok(bit_length_string::<i32, Int32Type>(array)),
- DataType::LargeUtf8 => Ok(bit_length_string::<i64, Int64Type>(array)),
- DataType::Binary => Ok(bit_length_binary::<i32, Int32Type>(array)),
- DataType::LargeBinary => Ok(bit_length_binary::<i64,
Int64Type>(array)),
+ DataType::LargeList(_) => {
+ let list = array.as_list::<i64>();
+ Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
+ }
+ DataType::Utf8 => {
+ let list = array.as_string::<i32>();
+ Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
+ }
+ DataType::LargeUtf8 => {
+ let list = array.as_string::<i64>();
+ Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
+ }
+ DataType::Binary => {
+ let list = array.as_binary::<i32>();
+ Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
+ }
+ DataType::LargeBinary => {
+ let list = array.as_binary::<i64>();
+ Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
+ }
+ DataType::FixedSizeBinary(len) => Ok(Arc::new(Int32Array::new(
Review Comment:
the support for FixedSizeBinary seems to be new here but there is no test
update -- maybe we can add a test for it?
##########
arrow-string/src/length.rs:
##########
@@ -194,28 +98,42 @@ pub fn length(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
/// * bit_length of null is null.
/// * bit_length is in number of bits
pub fn bit_length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
+ if let Some(d) = array.as_any_dictionary_opt() {
+ let lengths = bit_length(d.values().as_ref())?;
+ return Ok(d.with_values(lengths));
+ }
+
match array.data_type() {
- DataType::Dictionary(kt, _) => {
- kernel_dict!(
- array,
- |a| { bit_length(a) },
- kt,
- Int8: Int8Type,
- Int16: Int16Type,
- Int32: Int32Type,
- Int64: Int64Type,
- UInt8: UInt8Type,
- UInt16: UInt16Type,
- UInt32: UInt32Type,
- UInt64: UInt64Type
- )
+ DataType::List(_) => {
+ let list = array.as_list::<i32>();
+ Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
}
- DataType::Utf8 => Ok(bit_length_string::<i32, Int32Type>(array)),
- DataType::LargeUtf8 => Ok(bit_length_string::<i64, Int64Type>(array)),
- DataType::Binary => Ok(bit_length_binary::<i32, Int32Type>(array)),
- DataType::LargeBinary => Ok(bit_length_binary::<i64,
Int64Type>(array)),
+ DataType::LargeList(_) => {
+ let list = array.as_list::<i64>();
+ Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
+ }
+ DataType::Utf8 => {
+ let list = array.as_string::<i32>();
+ Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
+ }
+ DataType::LargeUtf8 => {
+ let list = array.as_string::<i64>();
+ Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
+ }
+ DataType::Binary => {
+ let list = array.as_binary::<i32>();
+ Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
+ }
+ DataType::LargeBinary => {
+ let list = array.as_binary::<i64>();
+ Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
+ }
+ DataType::FixedSizeBinary(len) => Ok(Arc::new(Int32Array::new(
+ vec![*len * 8; array.len()].into(),
+ array.nulls().cloned(),
+ ))),
other => Err(ArrowError::ComputeError(format!(
- "bit_length not supported for {other:?}"
+ "length not supported for {other:?}"
Review Comment:
I think the prior message was correct as this is the `bit_length`
implementation 🤔
##########
arrow-string/src/length.rs:
##########
@@ -19,135 +19,29 @@
use arrow_array::*;
use arrow_array::{cast::AsArray, types::*};
-use arrow_buffer::Buffer;
-use arrow_data::ArrayData;
+use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer};
use arrow_schema::{ArrowError, DataType};
use std::sync::Arc;
-macro_rules! unary_offsets {
- ($array: expr, $data_type: expr, $op: expr) => {{
- let slice = $array.value_offsets();
-
- let lengths = slice.windows(2).map(|offset| $op(offset[1] -
offset[0]));
-
- // JUSTIFICATION
- // Benefit
- // ~60% speedup
- // Soundness
- // `values` come from a slice iterator with a known size.
- let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) };
-
- let null_bit_buffer = $array.nulls().map(|b| b.inner().sliced());
-
- let data = unsafe {
- ArrayData::new_unchecked(
- $data_type,
- $array.len(),
- None,
- null_bit_buffer,
- 0,
- vec![buffer],
- vec![],
- )
- };
- make_array(data)
- }};
+fn length_impl<P: ArrowPrimitiveType>(
+ offsets: &OffsetBuffer<P::Native>,
+ nulls: Option<&NullBuffer>,
+) -> ArrayRef {
+ let v: Vec<_> = offsets
+ .windows(2)
Review Comment:
this is very clever 👍
##########
arrow-string/src/length.rs:
##########
@@ -19,135 +19,29 @@
use arrow_array::*;
use arrow_array::{cast::AsArray, types::*};
-use arrow_buffer::Buffer;
-use arrow_data::ArrayData;
+use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer};
use arrow_schema::{ArrowError, DataType};
use std::sync::Arc;
-macro_rules! unary_offsets {
- ($array: expr, $data_type: expr, $op: expr) => {{
- let slice = $array.value_offsets();
-
- let lengths = slice.windows(2).map(|offset| $op(offset[1] -
offset[0]));
-
- // JUSTIFICATION
- // Benefit
- // ~60% speedup
- // Soundness
- // `values` come from a slice iterator with a known size.
- let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) };
-
- let null_bit_buffer = $array.nulls().map(|b| b.inner().sliced());
-
- let data = unsafe {
- ArrayData::new_unchecked(
- $data_type,
- $array.len(),
- None,
- null_bit_buffer,
- 0,
- vec![buffer],
- vec![],
- )
- };
- make_array(data)
- }};
+fn length_impl<P: ArrowPrimitiveType>(
+ offsets: &OffsetBuffer<P::Native>,
+ nulls: Option<&NullBuffer>,
+) -> ArrayRef {
+ let v: Vec<_> = offsets
+ .windows(2)
Review Comment:
this is very clever 👍
##########
arrow-string/src/length.rs:
##########
@@ -194,28 +98,42 @@ pub fn length(array: &dyn Array) -> Result<ArrayRef,
ArrowError> {
/// * bit_length of null is null.
/// * bit_length is in number of bits
pub fn bit_length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
+ if let Some(d) = array.as_any_dictionary_opt() {
+ let lengths = bit_length(d.values().as_ref())?;
+ return Ok(d.with_values(lengths));
+ }
+
match array.data_type() {
- DataType::Dictionary(kt, _) => {
- kernel_dict!(
- array,
- |a| { bit_length(a) },
- kt,
- Int8: Int8Type,
- Int16: Int16Type,
- Int32: Int32Type,
- Int64: Int64Type,
- UInt8: UInt8Type,
- UInt16: UInt16Type,
- UInt32: UInt32Type,
- UInt64: UInt64Type
- )
+ DataType::List(_) => {
+ let list = array.as_list::<i32>();
+ Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
}
- DataType::Utf8 => Ok(bit_length_string::<i32, Int32Type>(array)),
- DataType::LargeUtf8 => Ok(bit_length_string::<i64, Int64Type>(array)),
- DataType::Binary => Ok(bit_length_binary::<i32, Int32Type>(array)),
- DataType::LargeBinary => Ok(bit_length_binary::<i64,
Int64Type>(array)),
+ DataType::LargeList(_) => {
+ let list = array.as_list::<i64>();
+ Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
+ }
+ DataType::Utf8 => {
+ let list = array.as_string::<i32>();
+ Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
+ }
+ DataType::LargeUtf8 => {
+ let list = array.as_string::<i64>();
+ Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
+ }
+ DataType::Binary => {
+ let list = array.as_binary::<i32>();
+ Ok(bit_length_impl::<Int32Type>(list.offsets(), list.nulls()))
+ }
+ DataType::LargeBinary => {
+ let list = array.as_binary::<i64>();
+ Ok(bit_length_impl::<Int64Type>(list.offsets(), list.nulls()))
+ }
+ DataType::FixedSizeBinary(len) => Ok(Arc::new(Int32Array::new(
+ vec![*len * 8; array.len()].into(),
+ array.nulls().cloned(),
+ ))),
other => Err(ArrowError::ComputeError(format!(
- "bit_length not supported for {other:?}"
+ "length not supported for {other:?}"
Review Comment:
I think the prior message was correct as this is the `bit_length`
implementation 🤔
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]