martin-g commented on code in PR #20278:
URL: https://github.com/apache/datafusion/pull/20278#discussion_r2816618201
##########
datafusion/functions/src/unicode/lpad.rs:
##########
@@ -266,17 +292,28 @@ where
continue;
}
- // Reuse buffer by clearing and refilling
- graphemes_buf.clear();
- graphemes_buf.extend(string.graphemes(true));
-
- if length < graphemes_buf.len() {
- builder.append_value(graphemes_buf[..length].concat());
+ if string.is_ascii() {
+ // ASCII fast path: byte length == character length
+ let str_len = string.len();
+ if length < str_len {
Review Comment:
```suggestion
if length == str_len {
builder.append_value(string);
} else if length < str_len {
```
this would avoid `builder.write_str(" ".repeat(length - str_len).as_str())?;`
##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -273,27 +287,49 @@ where
);
}
let length = if length < 0 { 0 } else { length
as usize };
- // Reuse buffer by clearing and refilling
- graphemes_buf.clear();
- graphemes_buf.extend(string.graphemes(true));
-
- if length < graphemes_buf.len() {
- builder
-
.append_value(graphemes_buf[..length].concat());
- } else if fill.is_empty() {
- builder.append_value(string);
+ if string.is_ascii() && fill.is_ascii() {
+ // ASCII fast path: byte length ==
character length,
+ // so we skip expensive grapheme
segmentation.
+ let str_len = string.len();
+ if length < str_len {
+
builder.append_value(&string[..length]);
+ } else if fill.is_empty() {
+ builder.append_value(string);
+ } else {
+ let pad_len = length - str_len;
+ let fill_len = fill.len();
+ let full_reps = pad_len / fill_len;
+ let remainder = pad_len % fill_len;
+ builder.write_str(string)?;
+ for _ in 0..full_reps {
+ builder.write_str(fill)?;
+ }
+
builder.append_value(&fill[..remainder]);
Review Comment:
```suggestion
builder.append_value(&fill[..remainder])?;
if remainder > 0 {
builder.write_str(&fill[..remainder])?;
}
```
as at
https://github.com/apache/datafusion/pull/20278/changes#diff-4f47738c719ccd61cdf07e6eaee63e914ad552f9f0e87b736c431cfff64cddd2R247
##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -459,6 +495,17 @@ mod tests {
Utf8,
StringArray
);
+ test_function!(
+ RPadFunc::new(),
+ vec![
+ ColumnarValue::Scalar(ScalarValue::from("hello")),
+ ColumnarValue::Scalar(ScalarValue::from(2i64)),
+ ],
+ Ok(Some("he")),
+ &str,
+ Utf8,
+ StringArray
+ );
Review Comment:
```suggestion
);
test_function!(
RPadFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::from("hi")),
ColumnarValue::Scalar(ScalarValue::from(6i64)),
ColumnarValue::Scalar(ScalarValue::from("xy")),
],
Ok(Some("hixyxy")),
&str,
Utf8,
StringArray
);
```
to test `remainder == 0`
##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -234,6 +238,17 @@ where
let length = if length < 0 { 0 } else { length as
usize };
if length == 0 {
builder.append_value("");
+ } else if string.is_ascii() {
+ // ASCII fast path: byte length == character
length
+ let str_len = string.len();
+ if length < str_len {
Review Comment:
```suggestion
if length == str_len {
builder.append_value(string);
} else if length < str_len {
```
this would avoid `builder.write_str(" ".repeat(length - str_len).as_str())?;`
##########
datafusion/functions/benches/pad.rs:
##########
@@ -30,6 +33,51 @@ use std::hint::black_box;
use std::sync::Arc;
use std::time::Duration;
+const UNICODE_STRINGS: &[&str] = &[
+ "Ñandú",
+ "Íslensku",
+ "Þjóðarinnar",
+ "Ελληνική",
+ "Иванович",
+ "データフュージョン",
+ "José García",
+ "Ölçü bïrïmï",
+ "Ÿéšṱëṟḏàÿ",
+ "Ährenstraße",
+];
+
+fn create_unicode_string_array<O: OffsetSizeTrait>(
+ size: usize,
+ null_density: f32,
+) -> arrow::array::GenericStringArray<O> {
+ let mut rng = rand::rng();
+ let mut builder = GenericStringBuilder::<O>::new();
+ for i in 0..size {
+ if rng.random::<f32>() < null_density {
+ builder.append_null();
+ } else {
+ builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
+ }
+ }
+ builder.finish()
+}
+
+fn create_unicode_string_view_array(
Review Comment:
nit: The function is almost identical to `create_unicode_string_array()`.
Maybe merge them into a more generic one ?
##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -273,27 +287,49 @@ where
);
}
let length = if length < 0 { 0 } else { length
as usize };
- // Reuse buffer by clearing and refilling
- graphemes_buf.clear();
- graphemes_buf.extend(string.graphemes(true));
-
- if length < graphemes_buf.len() {
- builder
-
.append_value(graphemes_buf[..length].concat());
- } else if fill.is_empty() {
- builder.append_value(string);
+ if string.is_ascii() && fill.is_ascii() {
+ // ASCII fast path: byte length ==
character length,
+ // so we skip expensive grapheme
segmentation.
+ let str_len = string.len();
+ if length < str_len {
+
builder.append_value(&string[..length]);
+ } else if fill.is_empty() {
+ builder.append_value(string);
+ } else {
+ let pad_len = length - str_len;
+ let fill_len = fill.len();
+ let full_reps = pad_len / fill_len;
+ let remainder = pad_len % fill_len;
+ builder.write_str(string)?;
+ for _ in 0..full_reps {
+ builder.write_str(fill)?;
+ }
+
builder.append_value(&fill[..remainder]);
+ }
} else {
- builder.write_str(string)?;
- // Reuse fill_chars_buf by clearing and
refilling
- fill_chars_buf.clear();
- fill_chars_buf.extend(fill.chars());
- for l in 0..length - graphemes_buf.len() {
- let c = *fill_chars_buf
- .get(l % fill_chars_buf.len())
- .unwrap();
- builder.write_char(c)?;
+ // Reuse buffer by clearing and refilling
+ graphemes_buf.clear();
+
graphemes_buf.extend(string.graphemes(true));
+
+ if length < graphemes_buf.len() {
+ builder.append_value(
+ graphemes_buf[..length].concat(),
+ );
Review Comment:
```suggestion
)?;
```
##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -234,6 +238,17 @@ where
let length = if length < 0 { 0 } else { length as
usize };
if length == 0 {
builder.append_value("");
+ } else if string.is_ascii() {
+ // ASCII fast path: byte length == character
length
+ let str_len = string.len();
+ if length < str_len {
+ builder.append_value(&string[..length]);
+ } else {
+ builder.write_str(string)?;
+ builder.append_value(
+ " ".repeat(length - str_len).as_str(),
+ );
Review Comment:
```suggestion
for _ in 0..(length - str_len) {
builder.write_char(' ')?;
}
```
to avoid the String allocation.
If the bench test shows that it is better then please apply it also in all
other places.
##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -273,27 +287,49 @@ where
);
}
let length = if length < 0 { 0 } else { length
as usize };
- // Reuse buffer by clearing and refilling
- graphemes_buf.clear();
- graphemes_buf.extend(string.graphemes(true));
-
- if length < graphemes_buf.len() {
- builder
-
.append_value(graphemes_buf[..length].concat());
- } else if fill.is_empty() {
- builder.append_value(string);
+ if string.is_ascii() && fill.is_ascii() {
+ // ASCII fast path: byte length ==
character length,
+ // so we skip expensive grapheme
segmentation.
+ let str_len = string.len();
+ if length < str_len {
+
builder.append_value(&string[..length]);
+ } else if fill.is_empty() {
+ builder.append_value(string);
+ } else {
+ let pad_len = length - str_len;
+ let fill_len = fill.len();
+ let full_reps = pad_len / fill_len;
+ let remainder = pad_len % fill_len;
+ builder.write_str(string)?;
+ for _ in 0..full_reps {
+ builder.write_str(fill)?;
+ }
+
builder.append_value(&fill[..remainder]);
+ }
} else {
- builder.write_str(string)?;
- // Reuse fill_chars_buf by clearing and
refilling
- fill_chars_buf.clear();
- fill_chars_buf.extend(fill.chars());
- for l in 0..length - graphemes_buf.len() {
- let c = *fill_chars_buf
- .get(l % fill_chars_buf.len())
- .unwrap();
- builder.write_char(c)?;
+ // Reuse buffer by clearing and refilling
+ graphemes_buf.clear();
+
graphemes_buf.extend(string.graphemes(true));
+
+ if length < graphemes_buf.len() {
+ builder.append_value(
+ graphemes_buf[..length].concat(),
+ );
+ } else if fill.is_empty() {
+ builder.append_value(string);
Review Comment:
```suggestion
builder.append_value(string)?;
```
##########
datafusion/functions/src/unicode/lpad.rs:
##########
@@ -523,6 +560,11 @@ mod tests {
None,
Ok(None)
);
+ test_lpad!(
+ Some("hello".into()),
+ ScalarValue::Int64(Some(2i64)),
+ Ok(Some("he"))
+ );
Review Comment:
```suggestion
);
test_lpad!(
Some("hi".into()),
ScalarValue::Int64(Some(6i64)),
Some("xy".into()),
Ok(Some("xyxyhi"))
);
```
to test `remainder == 0`
##########
datafusion/functions/src/unicode/rpad.rs:
##########
@@ -273,27 +287,49 @@ where
);
}
let length = if length < 0 { 0 } else { length
as usize };
- // Reuse buffer by clearing and refilling
- graphemes_buf.clear();
- graphemes_buf.extend(string.graphemes(true));
-
- if length < graphemes_buf.len() {
- builder
-
.append_value(graphemes_buf[..length].concat());
- } else if fill.is_empty() {
- builder.append_value(string);
+ if string.is_ascii() && fill.is_ascii() {
+ // ASCII fast path: byte length ==
character length,
+ // so we skip expensive grapheme
segmentation.
+ let str_len = string.len();
+ if length < str_len {
+
builder.append_value(&string[..length]);
+ } else if fill.is_empty() {
+ builder.append_value(string);
+ } else {
+ let pad_len = length - str_len;
+ let fill_len = fill.len();
+ let full_reps = pad_len / fill_len;
+ let remainder = pad_len % fill_len;
+ builder.write_str(string)?;
+ for _ in 0..full_reps {
+ builder.write_str(fill)?;
+ }
+
builder.append_value(&fill[..remainder]);
+ }
} else {
- builder.write_str(string)?;
- // Reuse fill_chars_buf by clearing and
refilling
- fill_chars_buf.clear();
- fill_chars_buf.extend(fill.chars());
- for l in 0..length - graphemes_buf.len() {
- let c = *fill_chars_buf
- .get(l % fill_chars_buf.len())
- .unwrap();
- builder.write_char(c)?;
+ // Reuse buffer by clearing and refilling
+ graphemes_buf.clear();
+
graphemes_buf.extend(string.graphemes(true));
+
+ if length < graphemes_buf.len() {
+ builder.append_value(
+ graphemes_buf[..length].concat(),
+ );
+ } else if fill.is_empty() {
+ builder.append_value(string);
+ } else {
+ builder.write_str(string)?;
+ // Reuse fill_chars_buf by clearing
and refilling
+ fill_chars_buf.clear();
+ fill_chars_buf.extend(fill.chars());
+ for l in 0..length -
graphemes_buf.len() {
+ let c = *fill_chars_buf
+ .get(l % fill_chars_buf.len())
+ .unwrap();
+ builder.write_char(c)?;
+ }
+ builder.append_value("");
Review Comment:
```suggestion
builder.append_value("")?;
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]