Re: [PR] Fix some edge cases in UTF-8 incrementing [arrow-rs]

via GitHub Thu, 12 Dec 2024 08:06:10 -0800


alamb commented on code in PR #6870:
URL: https://github.com/apache/arrow-rs/pull/6870#discussion_r1882440700



##########
parquet/src/column/writer/mod.rs:
##########
@@ -3140,39 +3157,67 @@ mod tests {
 
     #[test]
     fn test_increment_utf8() {
+        let test_inc = |o: &str, expected: &str| {
+            if let Ok(v) = 
String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) {
+                // Got the expected result...
+                assert_eq!(v, expected);
+                // and it's greater than the original string
+                assert!(*v > *o);
+                // Also show that BinaryArray level comparison works here
+                let mut greater = ByteArray::new();
+                greater.set_data(Bytes::from(v));
+                let mut original = ByteArray::new();
+                original.set_data(Bytes::from(o.as_bytes().to_vec()));
+                assert!(greater > original);
+            } else {
+                panic!("Expected incremented UTF8 string to also be valid.");
+            }
+        };
+
         // Basic ASCII case
-        let v = increment_utf8("hello".as_bytes().to_vec()).unwrap();
-        assert_eq!(&v, "hellp".as_bytes());
+        test_inc("hello", "hellp");
+
+        // 1-byte ending in max 1-byte
+        test_inc("a\u{7f}", "b");
 
-        // Also show that BinaryArray level comparison works here
-        let mut greater = ByteArray::new();
-        greater.set_data(Bytes::from(v));
-        let mut original = ByteArray::new();
-        original.set_data(Bytes::from("hello".as_bytes().to_vec()));
-        assert!(greater > original);
+        // 1-byte max should not truncate as it would need 2-byte code points
+        assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none());
 
         // UTF8 string
-        let s = "❤️🧡💛💚💙💜";
-        let v = increment_utf8(s.as_bytes().to_vec()).unwrap();
+        test_inc("❤️🧡💛💚💙💜", "❤️🧡💛💚💙💝");
 
-        if let Ok(new) = String::from_utf8(v) {
-            assert_ne!(&new, s);
-            assert_eq!(new, "❤️🧡💛💚💙💝");
-            assert!(new.as_bytes().last().unwrap() > 
s.as_bytes().last().unwrap());
-        } else {
-            panic!("Expected incremented UTF8 string to also be valid.")
-        }
+        // 2-byte without overflow
+        test_inc("éééé", "éééê");
 
-        // Max UTF8 character - should be a No-Op
-        let s = char::MAX.to_string();
-        assert_eq!(s.len(), 4);
-        let v = increment_utf8(s.as_bytes().to_vec());
-        assert!(v.is_none());
+        // 2-byte that overflows lowest byte
+        test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}");
+
+        // 2-byte ending in max 2-byte
+        test_inc("a\u{7ff}", "b");
+
+        // Max 2-byte should not truncate as it would need 3-byte code points
+        
assert!(increment_utf8("\u{7ff}\u{7ff}".as_bytes().to_vec()).is_none());
+
+        // 3-byte without overflow
+        test_inc("ࠀࠀ", "ࠀࠁ");

Review Comment:
   Maybe just a comment to help the next uneducated reader 😆 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Fix some edge cases in UTF-8 incrementing [arrow-rs]

Reply via email to