This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 4ad5c3dab7 perf: Optimize strpos() for ASCII-only inputs (#20295)
4ad5c3dab7 is described below
commit 4ad5c3dab7cbd9349b4c834c2de7afdeedd84989
Author: Neil Conway <[email protected]>
AuthorDate: Fri Feb 13 12:05:14 2026 -0500
perf: Optimize strpos() for ASCII-only inputs (#20295)
The previous implementation had a fast path for ASCII-only inputs, but
it was still relatively slow. Switch to using memchr::memchr() to find
the first matching byte and then check the rest of the bytes by hand.
This improves performance for ASCII inputs by 2x-4x on the built-in
strpos benchmarks.
## Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax. For example
`Closes #123` indicates that this PR will close issue #123.
-->
- Closes #20294.
## Are these changes tested?
Yes, passes unit tests and SLT.
## Are there any user-facing changes?
No.
---
Cargo.lock | 5 ++-
datafusion/functions/Cargo.toml | 1 +
datafusion/functions/benches/strpos.rs | 9 ++--
datafusion/functions/src/unicode/strpos.rs | 68 ++++++++++++++++++------------
4 files changed, 51 insertions(+), 32 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 18e1b8b7f7..7b2568d83c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2239,6 +2239,7 @@ dependencies = [
"itertools 0.14.0",
"log",
"md-5",
+ "memchr",
"num-traits",
"rand 0.9.2",
"regex",
@@ -4009,9 +4010,9 @@ dependencies = [
[[package]]
name = "memchr"
-version = "2.7.6"
+version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
+checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
[[package]]
name = "mimalloc"
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 599b85116b..35eddec053 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -82,6 +82,7 @@ hex = { workspace = true, optional = true }
itertools = { workspace = true }
log = { workspace = true }
md-5 = { version = "^0.10.0", optional = true }
+memchr = "2.8.0"
num-traits = { workspace = true }
rand = { workspace = true }
regex = { workspace = true, optional = true }
diff --git a/datafusion/functions/benches/strpos.rs
b/datafusion/functions/benches/strpos.rs
index 8cfbc791c4..94ce919c3d 100644
--- a/datafusion/functions/benches/strpos.rs
+++ b/datafusion/functions/benches/strpos.rs
@@ -27,9 +27,12 @@ use std::hint::black_box;
use std::str::Chars;
use std::sync::Arc;
-/// gen_arr(4096, 128, 0.1, 0.1, true) will generate a StringViewArray with
-/// 4096 rows, each row containing a string with 128 random characters.
-/// around 10% of the rows are null, around 10% of the rows are non-ASCII.
+/// Returns a `Vec<ColumnarValue>` with two elements: a haystack array and a
+/// needle array. Each haystack is a random string of `str_len_chars`
+/// characters. Each needle is a random contiguous substring of its
+/// corresponding haystack (i.e., the needle is always present in the
haystack).
+/// Around `null_density` fraction of rows are null and `utf8_density` fraction
+/// contain non-ASCII characters; the remaining rows are ASCII-only.
fn gen_string_array(
n_rows: usize,
str_len_chars: usize,
diff --git a/datafusion/functions/src/unicode/strpos.rs
b/datafusion/functions/src/unicode/strpos.rs
index 9be086c4cf..c1d6ecffe5 100644
--- a/datafusion/functions/src/unicode/strpos.rs
+++ b/datafusion/functions/src/unicode/strpos.rs
@@ -32,6 +32,7 @@ use datafusion_expr::{
Volatility,
};
use datafusion_macros::user_doc;
+use memchr::memchr;
#[user_doc(
doc_section(label = "String Functions"),
@@ -179,6 +180,31 @@ fn strpos(args: &[ArrayRef]) -> Result<ArrayRef> {
}
}
+/// Find `needle` in `haystack` using `memchr` to quickly skip to positions
+/// where the first byte matches, then verify the remaining bytes. Using
+/// string::find is slower because it has significant per-call overhead that
+/// `memchr` does not, and strpos is often invoked many times on short inputs.
+/// Returns a 1-based position, or 0 if not found.
+/// Both inputs must be ASCII-only.
+fn find_ascii_substring(haystack: &[u8], needle: &[u8]) -> usize {
+ let needle_len = needle.len();
+ let first_byte = needle[0];
+ let mut offset = 0;
+
+ while let Some(pos) = memchr(first_byte, &haystack[offset..]) {
+ let start = offset + pos;
+ if start + needle_len > haystack.len() {
+ return 0;
+ }
+ if haystack[start..start + needle_len] == *needle {
+ return start + 1;
+ }
+ offset = start + 1;
+ }
+
+ 0
+}
+
/// Returns starting index of specified substring within string, or zero if
it's not present. (Same as position(substring in string), but note the reversed
argument order.)
/// strpos('high', 'ig') = 2
/// The implementation uses UTF-8 code points as characters
@@ -198,37 +224,25 @@ where
.zip(substring_iter)
.map(|(string, substring)| match (string, substring) {
(Some(string), Some(substring)) => {
- // If only ASCII characters are present, we can use the slide
window method to find
- // the sub vector in the main vector. This is faster than
string.find() method.
+ if substring.is_empty() {
+ return T::Native::from_usize(1);
+ }
+
+ let substring_bytes = substring.as_bytes();
+ let string_bytes = string.as_bytes();
+
+ if substring_bytes.len() > string_bytes.len() {
+ return T::Native::from_usize(0);
+ }
+
if ascii_only {
- // If the substring is empty, the result is 1.
- if substring.is_empty() {
- T::Native::from_usize(1)
- } else {
- T::Native::from_usize(
- string
- .as_bytes()
- .windows(substring.len())
- .position(|w| w == substring.as_bytes())
- .map(|x| x + 1)
- .unwrap_or(0),
- )
- }
+ T::Native::from_usize(find_ascii_substring(
+ string_bytes,
+ substring_bytes,
+ ))
} else {
// For non-ASCII, use a single-pass search that tracks both
// byte position and character position simultaneously
- if substring.is_empty() {
- return T::Native::from_usize(1);
- }
-
- let substring_bytes = substring.as_bytes();
- let string_bytes = string.as_bytes();
-
- if substring_bytes.len() > string_bytes.len() {
- return T::Native::from_usize(0);
- }
-
- // Single pass: find substring while counting characters
let mut char_pos = 0;
for (byte_idx, _) in string.char_indices() {
char_pos += 1;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]