Re: [PR] Fix some edge cases in UTF-8 incrementing [arrow-rs]

via GitHub Thu, 12 Dec 2024 05:41:55 -0800


alamb commented on code in PR #6870:
URL: https://github.com/apache/arrow-rs/pull/6870#discussion_r1882098814



##########
parquet/src/column/writer/mod.rs:
##########
@@ -3140,39 +3157,67 @@ mod tests {
 
     #[test]
     fn test_increment_utf8() {
+        let test_inc = |o: &str, expected: &str| {
+            if let Ok(v) = 
String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) {
+                // Got the expected result...
+                assert_eq!(v, expected);
+                // and it's greater than the original string
+                assert!(*v > *o);
+                // Also show that BinaryArray level comparison works here
+                let mut greater = ByteArray::new();
+                greater.set_data(Bytes::from(v));
+                let mut original = ByteArray::new();
+                original.set_data(Bytes::from(o.as_bytes().to_vec()));
+                assert!(greater > original);
+            } else {
+                panic!("Expected incremented UTF8 string to also be valid.");
+            }
+        };
+
         // Basic ASCII case
-        let v = increment_utf8("hello".as_bytes().to_vec()).unwrap();
-        assert_eq!(&v, "hellp".as_bytes());
+        test_inc("hello", "hellp");
+
+        // 1-byte ending in max 1-byte
+        test_inc("a\u{7f}", "b");
 
-        // Also show that BinaryArray level comparison works here
-        let mut greater = ByteArray::new();
-        greater.set_data(Bytes::from(v));
-        let mut original = ByteArray::new();
-        original.set_data(Bytes::from("hello".as_bytes().to_vec()));
-        assert!(greater > original);
+        // 1-byte max should not truncate as it would need 2-byte code points
+        assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none());
 
         // UTF8 string
-        let s = "❤️🧡💛💚💙💜";
-        let v = increment_utf8(s.as_bytes().to_vec()).unwrap();
+        test_inc("❤️🧡💛💚💙💜", "❤️🧡💛💚💙💝");
 
-        if let Ok(new) = String::from_utf8(v) {
-            assert_ne!(&new, s);
-            assert_eq!(new, "❤️🧡💛💚💙💝");
-            assert!(new.as_bytes().last().unwrap() > 
s.as_bytes().last().unwrap());
-        } else {
-            panic!("Expected incremented UTF8 string to also be valid.")
-        }
+        // 2-byte without overflow
+        test_inc("éééé", "éééê");
 
-        // Max UTF8 character - should be a No-Op
-        let s = char::MAX.to_string();
-        assert_eq!(s.len(), 4);
-        let v = increment_utf8(s.as_bytes().to_vec());
-        assert!(v.is_none());
+        // 2-byte that overflows lowest byte
+        test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}");
+
+        // 2-byte ending in max 2-byte
+        test_inc("a\u{7ff}", "b");
+
+        // Max 2-byte should not truncate as it would need 3-byte code points
+        
assert!(increment_utf8("\u{7ff}\u{7ff}".as_bytes().to_vec()).is_none());
+
+        // 3-byte without overflow
+        test_inc("ࠀࠀ", "ࠀࠁ");

Review Comment:
   This might be related to how this renders, but it seems strange to me that 
the first character is incremented ratehr than the last
   
   In other words, I would have expected it like
   ```suggestion
           test_inc("ࠂࠁ", "ࠀࠁ");
   ```
   
   If I am reading the internet right 🤔 
   
   
   https://unicodes.jessetane.com/%E0%A0%82
   
   https://unicodes.jessetane.com/%E0%A0%80



##########
parquet/src/column/writer/mod.rs:
##########
@@ -3140,39 +3157,67 @@ mod tests {
 
     #[test]
     fn test_increment_utf8() {
+        let test_inc = |o: &str, expected: &str| {
+            if let Ok(v) = 
String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) {
+                // Got the expected result...
+                assert_eq!(v, expected);
+                // and it's greater than the original string
+                assert!(*v > *o);
+                // Also show that BinaryArray level comparison works here
+                let mut greater = ByteArray::new();
+                greater.set_data(Bytes::from(v));
+                let mut original = ByteArray::new();
+                original.set_data(Bytes::from(o.as_bytes().to_vec()));
+                assert!(greater > original);

Review Comment:
   100% for verifying this property
   



##########
parquet/src/column/writer/mod.rs:
##########
@@ -1444,15 +1444,31 @@ fn increment(mut data: Vec<u8>) -> Option<Vec<u8>> {
 /// Try and increment the the string's bytes from right to left, returning 
when the result
 /// is a valid UTF8 string. Returns `None` when it can't increment any byte.
 fn increment_utf8(mut data: Vec<u8>) -> Option<Vec<u8>> {
+    const UTF8_CONTINUATION: u8 = 0x80;
+    const UTF8_CONTINUATION_MASK: u8 = 0xc0;
+
+    let mut len = data.len();
     for idx in (0..data.len()).rev() {
         let original = data[idx];
         let (byte, overflow) = original.overflowing_add(1);
         if !overflow {
             data[idx] = byte;
             if str::from_utf8(&data).is_ok() {

Review Comment:
   we can probably make this much faster by not revalidating the entire string 
(we only have to look back and find where the utf8 boundary starts



##########
parquet/src/column/writer/mod.rs:
##########
@@ -3140,39 +3157,67 @@ mod tests {
 
     #[test]
     fn test_increment_utf8() {
+        let test_inc = |o: &str, expected: &str| {
+            if let Ok(v) = 
String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) {
+                // Got the expected result...
+                assert_eq!(v, expected);
+                // and it's greater than the original string
+                assert!(*v > *o);
+                // Also show that BinaryArray level comparison works here
+                let mut greater = ByteArray::new();
+                greater.set_data(Bytes::from(v));
+                let mut original = ByteArray::new();
+                original.set_data(Bytes::from(o.as_bytes().to_vec()));
+                assert!(greater > original);
+            } else {
+                panic!("Expected incremented UTF8 string to also be valid.");
+            }
+        };
+
         // Basic ASCII case
-        let v = increment_utf8("hello".as_bytes().to_vec()).unwrap();
-        assert_eq!(&v, "hellp".as_bytes());
+        test_inc("hello", "hellp");
+
+        // 1-byte ending in max 1-byte
+        test_inc("a\u{7f}", "b");
 
-        // Also show that BinaryArray level comparison works here
-        let mut greater = ByteArray::new();
-        greater.set_data(Bytes::from(v));
-        let mut original = ByteArray::new();
-        original.set_data(Bytes::from("hello".as_bytes().to_vec()));
-        assert!(greater > original);
+        // 1-byte max should not truncate as it would need 2-byte code points
+        assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none());
 
         // UTF8 string
-        let s = "❤️🧡💛💚💙💜";
-        let v = increment_utf8(s.as_bytes().to_vec()).unwrap();
+        test_inc("❤️🧡💛💚💙💜", "❤️🧡💛💚💙💝");
 
-        if let Ok(new) = String::from_utf8(v) {
-            assert_ne!(&new, s);
-            assert_eq!(new, "❤️🧡💛💚💙💝");
-            assert!(new.as_bytes().last().unwrap() > 
s.as_bytes().last().unwrap());
-        } else {
-            panic!("Expected incremented UTF8 string to also be valid.")
-        }
+        // 2-byte without overflow
+        test_inc("éééé", "éééê");
 
-        // Max UTF8 character - should be a No-Op
-        let s = char::MAX.to_string();
-        assert_eq!(s.len(), 4);
-        let v = increment_utf8(s.as_bytes().to_vec());
-        assert!(v.is_none());
+        // 2-byte that overflows lowest byte
+        test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}");

Review Comment:
   I double checked \u the unicode escape syntax: 
https://doc.rust-lang.org/reference/tokens.html#unicode-escapes ✅ 



##########
parquet/src/column/writer/mod.rs:
##########
@@ -1444,15 +1444,31 @@ fn increment(mut data: Vec<u8>) -> Option<Vec<u8>> {
 /// Try and increment the the string's bytes from right to left, returning 
when the result
 /// is a valid UTF8 string. Returns `None` when it can't increment any byte.
 fn increment_utf8(mut data: Vec<u8>) -> Option<Vec<u8>> {
+    const UTF8_CONTINUATION: u8 = 0x80;
+    const UTF8_CONTINUATION_MASK: u8 = 0xc0;
+
+    let mut len = data.len();
     for idx in (0..data.len()).rev() {
         let original = data[idx];
         let (byte, overflow) = original.overflowing_add(1);
         if !overflow {
             data[idx] = byte;
             if str::from_utf8(&data).is_ok() {
+                if len != data.len() {
+                    data.truncate(len);
+                }
                 return Some(data);
             }
-            data[idx] = original;
+            // Incrementing "original" did not yield a valid unicode 
character, so it overflowed
+            // its available bits. If it was a continuation byte (b10xxxxxx) 
then set to min
+            // continuation (b10000000). Otherwise it was the first byte so 
set reset the first
+            // byte back to its original value (so data remains a valid 
string) and reduce "len".
+            if original & UTF8_CONTINUATION_MASK == UTF8_CONTINUATION {

Review Comment:
   Trying to increment the last byte seems strange to me (as the byte my be in 
the middle of a utf8 multi-byte sequence. I don't fully understand the 
implications of setting the last byte to utf8 continuation token. Is that even 
valid UTF8 🤔 
   
   I wonder if we could do the increment operation on the actual unicode value 
rather than one of the bytes in the utf8 sequence? So something like
   - Find the start of the last utf8 character (e.g search from the right for 
the first byte that doesn't have its high bit set)
   - Convert that 1-4 byte sequence into a `char` (unicode)
   - increment the `char`
   - Convert back to utf-8
   - If the converted bytes don't fit at the end of the array, do something 
else (maybe try the next previous character)
   
   That would also save a call to `from_utf8` so it would also likely be faster 
too
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Fix some edge cases in UTF-8 incrementing [arrow-rs]

Reply via email to