neilconway commented on code in PR #21238:
URL: https://github.com/apache/datafusion/pull/21238#discussion_r3031035588
##########
datafusion/functions/src/string/split_part.rs:
##########
@@ -220,6 +231,190 @@ fn rsplit_nth<'a>(string: &'a str, delimiter: &str, n: usize) -> Option<&'a str>
}
}
+/// Fast path for `split_part(array, scalar_delimiter, scalar_position)`.
+fn split_part_scalar(
+ string_array: &ArrayRef,
+ delim_scalar: &ScalarValue,
+ pos_scalar: &ScalarValue,
+) -> Result<ColumnarValue> {
+ let delimiter = delim_scalar.try_as_str().ok_or_else(|| {
+ exec_datafusion_err!(
+ "Unsupported delimiter type {:?} for split_part",
+ delim_scalar.data_type()
+ )
+ })?;
+
+ let position = match pos_scalar {
+ ScalarValue::Int64(v) => *v,
+ other => {
+ return exec_err!(
+ "Unsupported position type {:?} for split_part",
+ other.data_type()
+ );
+ }
+ };
+
+ if position == Some(0) {
+ return exec_err!("field position must not be zero");
+ }
+
+ // Null delimiter or position → every row is null.
+ let (Some(delimiter), Some(position)) = (delimiter, position) else {
+ return Ok(ColumnarValue::Array(new_null_array(
+ string_array.data_type(),
+ string_array.len(),
+ )));
+ };
+
+ let result = match string_array.data_type() {
+ DataType::Utf8View => split_part_scalar_impl(
+ string_array.as_string_view(),
+ delimiter,
+ position,
+ StringViewBuilder::with_capacity(string_array.len()),
+ ),
+ DataType::Utf8 => {
+ let arr = string_array.as_string::<i32>();
+ split_part_scalar_impl(
+ arr,
+ delimiter,
+ position,
+ GenericStringBuilder::<i32>::with_capacity(
+ arr.len(),
+ arr.value_data().len(),
Review Comment:
The slow path uses the same code. On closer inspection, I think the more
serious issue is that we might significantly over-allocate when `split_part`
is called to return a small fragment of a much larger string (which is fairly
common).
I'm not sure what the best approach is here. Using `arr.len()` for the data
capacity is better than nothing, and at least it won't over-allocate... what do
you think?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]