neilconway commented on code in PR #20305:
URL: https://github.com/apache/datafusion/pull/20305#discussion_r2818178704
##########
datafusion/functions/src/unicode/translate.rs:
##########
@@ -199,6 +267,99 @@ where
Ok(Arc::new(result) as ArrayRef)
}
+/// Sentinel value in the ASCII translate table indicating the character should
+/// be deleted (the `from` character has no corresponding `to` character). Any
+/// value > 127 works since valid ASCII is 0–127.
+const ASCII_DELETE: u8 = 0xFF;
+
+/// If `from` and `to` are both ASCII, build a fixed-size lookup table for
+/// translation. Each entry maps an input byte to its replacement byte, or to
+/// [`ASCII_DELETE`] if the character should be removed. Returns `None` if
+/// either string contains non-ASCII characters.
+fn build_ascii_translate_table(from: &str, to: &str) -> Option<[u8; 128]> {
+ if !from.is_ascii() || !to.is_ascii() {
+ return None;
+ }
+ let mut table = [0u8; 128];
+ for i in 0..128u8 {
+ table[i as usize] = i;
+ }
+ let to_bytes = to.as_bytes();
+ let mut seen = [false; 128];
+ for (i, from_byte) in from.bytes().enumerate() {
+ let idx = from_byte as usize;
+ if !seen[idx] {
+ seen[idx] = true;
+ if i < to_bytes.len() {
+ table[idx] = to_bytes[i];
+ } else {
+ table[idx] = ASCII_DELETE;
+ }
+ }
+ }
+ Some(table)
+}
+
+/// Optimized translate for constant `from` and `to` arguments: uses a pre-built
+/// translation map instead of rebuilding it for every row. When an ASCII byte
+/// lookup table is provided, ASCII input rows use the lookup table; non-ASCII
+/// inputs fall back to using the map.
+fn translate_with_map<'a, T: OffsetSizeTrait, V>(
+ string_array: V,
+ from_map: &HashMap<&str, usize>,
+ to_graphemes: &[&str],
+ ascii_table: Option<&[u8; 128]>,
+) -> Result<ArrayRef>
+where
+ V: ArrayAccessor<Item = &'a str>,
+{
+ let mut string_graphemes: Vec<&str> = Vec::new();
+ let mut result_graphemes: Vec<&str> = Vec::new();
+ let mut ascii_buf: Vec<u8> = Vec::new();
+
+ let result = ArrayIter::new(string_array)
+ .map(|string| {
+ string.map(|s| {
+ // Fast path: byte-level table lookup for ASCII strings
+ if let Some(table) = ascii_table
+ && s.is_ascii()
+ {
+ ascii_buf.clear();
+ for &b in s.as_bytes() {
+ let mapped = table[b as usize];
+ if mapped != ASCII_DELETE {
+ ascii_buf.push(mapped);
+ }
+ }
+ // ascii_buf contains only ASCII bytes, so it is valid
+ // UTF-8.
+ return String::from_utf8(ascii_buf.clone()).unwrap();
Review Comment:
Good catch!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]