etseidl commented on code in PR #6870:
URL: https://github.com/apache/arrow-rs/pull/6870#discussion_r1882392270


##########
parquet/src/column/writer/mod.rs:
##########
@@ -3140,39 +3157,67 @@ mod tests {
 
     #[test]
     fn test_increment_utf8() {
+        let test_inc = |o: &str, expected: &str| {
+            if let Ok(v) = 
String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) {
+                // Got the expected result...
+                assert_eq!(v, expected);
+                // and it's greater than the original string
+                assert!(*v > *o);
+                // Also show that BinaryArray level comparison works here
+                let mut greater = ByteArray::new();
+                greater.set_data(Bytes::from(v));
+                let mut original = ByteArray::new();
+                original.set_data(Bytes::from(o.as_bytes().to_vec()));
+                assert!(greater > original);
+            } else {
+                panic!("Expected incremented UTF8 string to also be valid.");
+            }
+        };
+
         // Basic ASCII case
-        let v = increment_utf8("hello".as_bytes().to_vec()).unwrap();
-        assert_eq!(&v, "hellp".as_bytes());
+        test_inc("hello", "hellp");
+
+        // 1-byte ending in max 1-byte
+        test_inc("a\u{7f}", "b");
 
-        // Also show that BinaryArray level comparison works here
-        let mut greater = ByteArray::new();
-        greater.set_data(Bytes::from(v));
-        let mut original = ByteArray::new();
-        original.set_data(Bytes::from("hello".as_bytes().to_vec()));
-        assert!(greater > original);
+        // 1-byte max should not truncate as it would need 2-byte code points
+        assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none());
 
         // UTF8 string
-        let s = "โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’œ";
-        let v = increment_utf8(s.as_bytes().to_vec()).unwrap();
+        test_inc("โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’œ", "โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’");
 
-        if let Ok(new) = String::from_utf8(v) {
-            assert_ne!(&new, s);
-            assert_eq!(new, "โค๏ธ๐Ÿงก๐Ÿ’›๐Ÿ’š๐Ÿ’™๐Ÿ’");
-            assert!(new.as_bytes().last().unwrap() > 
s.as_bytes().last().unwrap());
-        } else {
-            panic!("Expected incremented UTF8 string to also be valid.")
-        }
+        // 2-byte without overflow
+        test_inc("รฉรฉรฉรฉ", "รฉรฉรฉรช");
 
-        // Max UTF8 character - should be a No-Op
-        let s = char::MAX.to_string();
-        assert_eq!(s.len(), 4);
-        let v = increment_utf8(s.as_bytes().to_vec());
-        assert!(v.is_none());
+        // 2-byte that overflows lowest byte
+        test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}");
+
+        // 2-byte ending in max 2-byte
+        test_inc("a\u{7ff}", "b");
+
+        // Max 2-byte should not truncate as it would need 3-byte code points
+        
assert!(increment_utf8("\u{7ff}\u{7ff}".as_bytes().to_vec()).is_none());
+
+        // 3-byte without overflow
+        test_inc("เ €เ €", "เ €เ ");

Review Comment:
   It's a chart set that renders right to left ๐Ÿ˜…. Fun hitting delete and seeing 
which char disappears. We could change to escapes.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to