alamb commented on code in PR #9732:
URL: https://github.com/apache/arrow-datafusion/pull/9732#discussion_r1550469799
##########
datafusion/physical-expr/src/string_expressions.rs:
##########
@@ -38,75 +40,147 @@ use datafusion_common::{
};
use datafusion_expr::ColumnarValue;
+enum ColumnarValueRef<'a> {
+ Scalar(&'a [u8]),
+ NullableArray(&'a StringArray),
+ NonNullableArray(&'a StringArray),
+}
+
+impl<'a> ColumnarValueRef<'a> {
+ #[inline]
+ fn is_valid(&self, i: usize) -> bool {
+ match &self {
+ Self::Scalar(_) | Self::NonNullableArray(_) => true,
+ Self::NullableArray(array) => array.is_valid(i),
+ }
+ }
+
+ #[inline]
+ fn nulls(&self) -> Option<NullBuffer> {
+ match &self {
+ Self::Scalar(_) | Self::NonNullableArray(_) => None,
+ Self::NullableArray(array) => array.nulls().cloned(),
+ }
+ }
+}
+
+struct StringArrayBuilder {
+ offsets_buffer: MutableBuffer,
+ value_buffer: MutableBuffer,
+}
+
+impl StringArrayBuilder {
+ fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+ let mut offsets_buffer = MutableBuffer::with_capacity(
+ (item_capacity + 1) * std::mem::size_of::<i32>(),
+ );
+ unsafe { offsets_buffer.push_unchecked(0_i32) };
Review Comment:
Is it really necessary to avoid the bounds check for a single offset?
##########
datafusion/physical-expr/src/string_expressions.rs:
##########
@@ -38,75 +40,147 @@ use datafusion_common::{
};
use datafusion_expr::ColumnarValue;
+enum ColumnarValueRef<'a> {
+ Scalar(&'a [u8]),
+ NullableArray(&'a StringArray),
+ NonNullableArray(&'a StringArray),
+}
+
+impl<'a> ColumnarValueRef<'a> {
+ #[inline]
+ fn is_valid(&self, i: usize) -> bool {
+ match &self {
+ Self::Scalar(_) | Self::NonNullableArray(_) => true,
+ Self::NullableArray(array) => array.is_valid(i),
+ }
+ }
+
+ #[inline]
+ fn nulls(&self) -> Option<NullBuffer> {
+ match &self {
+ Self::Scalar(_) | Self::NonNullableArray(_) => None,
+ Self::NullableArray(array) => array.nulls().cloned(),
+ }
+ }
+}
+
+struct StringArrayBuilder {
Review Comment:
```suggestion
/// Optimized version of the StringBuilder in Arrow that:
/// 1. Precalculating the expected length of the result, avoiding
reallocations.
/// 2. Avoids creating / incrementally creating a `NullBufferBuilder`
struct StringArrayBuilder {
```
##########
datafusion/physical-expr/src/string_expressions.rs:
##########
@@ -38,75 +40,147 @@ use datafusion_common::{
};
use datafusion_expr::ColumnarValue;
+enum ColumnarValueRef<'a> {
+ Scalar(&'a [u8]),
+ NullableArray(&'a StringArray),
+ NonNullableArray(&'a StringArray),
+}
+
+impl<'a> ColumnarValueRef<'a> {
+ #[inline]
+ fn is_valid(&self, i: usize) -> bool {
+ match &self {
+ Self::Scalar(_) | Self::NonNullableArray(_) => true,
+ Self::NullableArray(array) => array.is_valid(i),
+ }
+ }
+
+ #[inline]
+ fn nulls(&self) -> Option<NullBuffer> {
+ match &self {
+ Self::Scalar(_) | Self::NonNullableArray(_) => None,
+ Self::NullableArray(array) => array.nulls().cloned(),
+ }
+ }
+}
+
+struct StringArrayBuilder {
+ offsets_buffer: MutableBuffer,
+ value_buffer: MutableBuffer,
+}
+
+impl StringArrayBuilder {
+ fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+ let mut offsets_buffer = MutableBuffer::with_capacity(
+ (item_capacity + 1) * std::mem::size_of::<i32>(),
+ );
+ unsafe { offsets_buffer.push_unchecked(0_i32) };
+ Self {
+ offsets_buffer,
+ value_buffer: MutableBuffer::with_capacity(data_capacity),
+ }
+ }
+
+ fn write<const CHECK_VALID: bool>(&mut self, column: &ColumnarValueRef, i:
usize) {
+ match column {
+ ColumnarValueRef::Scalar(s) => {
+ self.value_buffer.extend_from_slice(s);
+ }
+ ColumnarValueRef::NullableArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ ColumnarValueRef::NonNullableArray(array) => {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ }
+
+ fn append_offset(&mut self) {
+ let next_offset: i32 = self
+ .value_buffer
+ .len()
+ .try_into()
+ .expect("byte array offset overflow");
+ unsafe { self.offsets_buffer.push_unchecked(next_offset) };
+ }
+
+ fn finish(self, null_buffer: Option<NullBuffer>) -> StringArray {
+ let array_builder = ArrayDataBuilder::new(DataType::Utf8)
+ .len(self.offsets_buffer.len() / std::mem::size_of::<i32>() - 1)
+ .add_buffer(self.offsets_buffer.into())
+ .add_buffer(self.value_buffer.into())
+ .nulls(null_buffer);
+ let array_data = unsafe { array_builder.build_unchecked() };
Review Comment:
```suggestion
// SAFETY: all data that was appended was valid UTF8 and the values
// and offsets were created correctly
let array_data = unsafe { array_builder.build_unchecked() };
```
##########
datafusion/physical-expr/src/string_expressions.rs:
##########
@@ -38,75 +40,147 @@ use datafusion_common::{
};
use datafusion_expr::ColumnarValue;
+enum ColumnarValueRef<'a> {
+ Scalar(&'a [u8]),
+ NullableArray(&'a StringArray),
+ NonNullableArray(&'a StringArray),
+}
+
+impl<'a> ColumnarValueRef<'a> {
+ #[inline]
+ fn is_valid(&self, i: usize) -> bool {
+ match &self {
+ Self::Scalar(_) | Self::NonNullableArray(_) => true,
+ Self::NullableArray(array) => array.is_valid(i),
+ }
+ }
+
+ #[inline]
+ fn nulls(&self) -> Option<NullBuffer> {
+ match &self {
+ Self::Scalar(_) | Self::NonNullableArray(_) => None,
+ Self::NullableArray(array) => array.nulls().cloned(),
+ }
+ }
+}
+
+struct StringArrayBuilder {
+ offsets_buffer: MutableBuffer,
+ value_buffer: MutableBuffer,
+}
+
+impl StringArrayBuilder {
+ fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
+ let mut offsets_buffer = MutableBuffer::with_capacity(
+ (item_capacity + 1) * std::mem::size_of::<i32>(),
+ );
+ unsafe { offsets_buffer.push_unchecked(0_i32) };
+ Self {
+ offsets_buffer,
+ value_buffer: MutableBuffer::with_capacity(data_capacity),
+ }
+ }
+
+ fn write<const CHECK_VALID: bool>(&mut self, column: &ColumnarValueRef, i:
usize) {
+ match column {
+ ColumnarValueRef::Scalar(s) => {
+ self.value_buffer.extend_from_slice(s);
+ }
+ ColumnarValueRef::NullableArray(array) => {
+ if !CHECK_VALID || array.is_valid(i) {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ ColumnarValueRef::NonNullableArray(array) => {
+ self.value_buffer
+ .extend_from_slice(array.value(i).as_bytes());
+ }
+ }
+ }
+
+ fn append_offset(&mut self) {
+ let next_offset: i32 = self
+ .value_buffer
+ .len()
+ .try_into()
+ .expect("byte array offset overflow");
+ unsafe { self.offsets_buffer.push_unchecked(next_offset) };
+ }
+
+ fn finish(self, null_buffer: Option<NullBuffer>) -> StringArray {
Review Comment:
I was trying to think if there was some way to create an invalid result with
this API and I think the answer is No (even if append_offset wasn't called ever
the offsets would still be valid.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]