alamb commented on code in PR #6870:
URL: https://github.com/apache/arrow-rs/pull/6870#discussion_r1884692601


##########
parquet/src/column/writer/mod.rs:
##########
@@ -878,24 +878,44 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, 
E> {
         }
     }
 
+    /// Returns `true` if this column's logical type is a UTF-8 string.
+    fn is_utf8(&self) -> bool {
+        self.get_descriptor().logical_type() == Some(LogicalType::String)
+            || self.get_descriptor().converted_type() == ConvertedType::UTF8
+    }
+
     fn truncate_min_value(&self, truncation_length: Option<usize>, data: 
&[u8]) -> (Vec<u8>, bool) {
         truncation_length
             .filter(|l| data.len() > *l)
-            .and_then(|l| match str::from_utf8(data) {
-                Ok(str_data) => truncate_utf8(str_data, l),
-                Err(_) => Some(data[..l].to_vec()),
-            })
+            .and_then(|l|
+                // don't do extra work if this column isn't UTF-8

Review Comment:
   💯 



##########
parquet/src/column/writer/mod.rs:
##########
@@ -3192,6 +3262,40 @@ mod tests {
         // One multi-byte code point, and a length shorter than it, so we 
can't slice it
         let r = truncate_utf8("\u{0836}", 1);
         assert!(r.is_none());
+
+        // Test truncate and increment for max bounds on UTF-8 statistics
+        // 7-bit (i.e. ASCII)
+        let r = truncate_and_increment_utf8("yyyyyyyyy", 8).unwrap();
+        assert_eq!(&r, "yyyyyyyz".as_bytes());
+
+        // 2-byte without overflow
+        let r = truncate_and_increment_utf8("ééééé", 8).unwrap();
+        assert_eq!(&r, "éééê".as_bytes());
+
+        // 2-byte that overflows lowest byte
+        let r = truncate_and_increment_utf8("\u{ff}\u{ff}\u{ff}\u{ff}\u{ff}", 
8).unwrap();
+        assert_eq!(&r, "\u{ff}\u{ff}\u{ff}\u{100}".as_bytes());
+
+        // max 2-byte should not truncate as it would need 3-byte code points
+        let r = truncate_and_increment_utf8("߿߿߿߿߿", 8);
+        assert!(r.is_none());
+
+        // 3-byte without overflow [U+800, U+800, U+800] -> [U+800, U+801] 
(note that these
+        // characters should render right to left).
+        let r = truncate_and_increment_utf8("ࠀࠀࠀ", 8).unwrap();
+        assert_eq!(&r, "ࠀࠁ".as_bytes());
+
+        // max 3-byte should not truncate as it would need 4-byte code points
+        let r = truncate_and_increment_utf8("\u{ffff}\u{ffff}\u{ffff}", 8);
+        assert!(r.is_none());
+
+        // 4-byte without overflow
+        let r = truncate_and_increment_utf8("𐀀𐀀𐀀", 8).unwrap();
+        assert_eq!(&r, "𐀀𐀁".as_bytes());

Review Comment:
   Is there a test for incrementing a string that doesn't have space for 2 characters (aka exercises the loop twice)?
   
   Maybe something like truncating this to 5 bytes:
   ```rust
   truncate_and_increment_utf8("𐀀𐀀𐀀", 5).unwrap();
   ```
   which could only hold a single 4-byte UTF-8 code point



##########
parquet/src/column/writer/mod.rs:
##########
@@ -878,24 +878,44 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, 
E> {
         }
     }
 
+    /// Returns `true` if this column's logical type is a UTF-8 string.
+    fn is_utf8(&self) -> bool {
+        self.get_descriptor().logical_type() == Some(LogicalType::String)
+            || self.get_descriptor().converted_type() == ConvertedType::UTF8
+    }
+
     fn truncate_min_value(&self, truncation_length: Option<usize>, data: 
&[u8]) -> (Vec<u8>, bool) {
         truncation_length
             .filter(|l| data.len() > *l)
-            .and_then(|l| match str::from_utf8(data) {
-                Ok(str_data) => truncate_utf8(str_data, l),
-                Err(_) => Some(data[..l].to_vec()),
-            })
+            .and_then(|l|
+                // don't do extra work if this column isn't UTF-8
+                if self.is_utf8() {
+                    match str::from_utf8(data) {
+                        Ok(str_data) => truncate_utf8(str_data, l),
+                        Err(_) => Some(data[..l].to_vec()),

Review Comment:
   It is a somewhat questionable move to truncate this on invalid data, but I see that is what the code used to do, so this seems good to me.



##########
parquet/src/column/writer/mod.rs:
##########
@@ -1418,13 +1438,51 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: 
&[u8]) -> bool {
     (a[1..]) > (b[1..])
 }
 
-/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 
string,
-/// while being less than `length` bytes and non-empty
+/// Truncate a UTF-8 slice to the longest prefix that is still a valid UTF-8 
string,
+/// while being less than `length` bytes and non-empty. Returns `None` if 
truncation
+/// is not possible within those constraints.
+///
+/// The caller guarantees that data.len() > length.
 fn truncate_utf8(data: &str, length: usize) -> Option<Vec<u8>> {
     let split = (1..=length).rfind(|x| data.is_char_boundary(*x))?;
     Some(data.as_bytes()[..split].to_vec())
 }
 
+/// Truncate a UTF-8 slice and increment it's final character. The returned 
value is the
+/// longest such slice that is still a valid UTF-8 string while being less 
than `length`
+/// bytes and non-empty. Returns `None` if no such transformation is possible.
+///
+/// The caller guarantees that data.len() > length.
+fn truncate_and_increment_utf8(data: &str, length: usize) -> Option<Vec<u8>> {
+    // UTF-8 is max 4 bytes, so start search 3 back from desired length
+    let lower_bound = length.saturating_sub(3);
+    let split = (lower_bound..=length).rfind(|x| data.is_char_boundary(*x))?;
+    increment_utf8(data.get(..split)?)
+}
+
+/// Increment the final character in a UTF-8 string in such a way that the 
returned result
+/// is still a valid UTF-8 string. The returned string may be shorter than the 
input if the
+/// last character(s) cannot be incremented (due to overflow or producing 
invalid code points).
+/// Returns `None` if the string cannot be incremented.
+///
+/// Note that this implementation will not promote an N-byte code point to 
(N+1) bytes.
+fn increment_utf8(data: &str) -> Option<Vec<u8>> {
+    for (idx, code_point) in data.char_indices().rev() {
+        let curr_len = code_point.len_utf8();
+        let original = code_point as u32;

Review Comment:
   pedantic me would likely rename `original` --> `original_char` to mirror the 
naming of `next_char`



##########
parquet/src/column/writer/mod.rs:
##########
@@ -1418,13 +1438,51 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: 
&[u8]) -> bool {
     (a[1..]) > (b[1..])
 }
 
-/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 
string,
-/// while being less than `length` bytes and non-empty
+/// Truncate a UTF-8 slice to the longest prefix that is still a valid UTF-8 
string,
+/// while being less than `length` bytes and non-empty. Returns `None` if 
truncation
+/// is not possible within those constraints.
+///
+/// The caller guarantees that data.len() > length.
 fn truncate_utf8(data: &str, length: usize) -> Option<Vec<u8>> {
     let split = (1..=length).rfind(|x| data.is_char_boundary(*x))?;
     Some(data.as_bytes()[..split].to_vec())
 }
 
+/// Truncate a UTF-8 slice and increment it's final character. The returned 
value is the
+/// longest such slice that is still a valid UTF-8 string while being less 
than `length`
+/// bytes and non-empty. Returns `None` if no such transformation is possible.
+///
+/// The caller guarantees that data.len() > length.
+fn truncate_and_increment_utf8(data: &str, length: usize) -> Option<Vec<u8>> {
+    // UTF-8 is max 4 bytes, so start search 3 back from desired length
+    let lower_bound = length.saturating_sub(3);
+    let split = (lower_bound..=length).rfind(|x| data.is_char_boundary(*x))?;
+    increment_utf8(data.get(..split)?)
+}
+
+/// Increment the final character in a UTF-8 string in such a way that the 
returned result
+/// is still a valid UTF-8 string. The returned string may be shorter than the 
input if the
+/// last character(s) cannot be incremented (due to overflow or producing 
invalid code points).
+/// Returns `None` if the string cannot be incremented.
+///
+/// Note that this implementation will not promote an N-byte code point to 
(N+1) bytes.
+fn increment_utf8(data: &str) -> Option<Vec<u8>> {
+    for (idx, code_point) in data.char_indices().rev() {
+        let curr_len = code_point.len_utf8();
+        let original = code_point as u32;
+        if let Some(next_char) = char::from_u32(original + 1) {
+            // do not allow increasing byte width of incremented char

Review Comment:
   I suppose it is never the case that `next_char.len_utf8()` is going to be 
shorter than the current length, as the UTF-8 encoding of a larger code point will 
always be at least as large 🤔 
   
   I guess I am wondering: should this be an inequality? However, I think it is 
easier to reason about the invariants if it is known to be equal.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to