alamb commented on code in PR #6870:
URL: https://github.com/apache/arrow-rs/pull/6870#discussion_r1882440700
##########
parquet/src/column/writer/mod.rs:
##########
@@ -3140,39 +3157,67 @@ mod tests {
#[test]
fn test_increment_utf8() {
+ let test_inc = |o: &str, expected: &str| {
+ if let Ok(v) = String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) {
+ // Got the expected result...
+ assert_eq!(v, expected);
+ // and it's greater than the original string
+ assert!(*v > *o);
+ // Also show that BinaryArray level comparison works here
+ let mut greater = ByteArray::new();
+ greater.set_data(Bytes::from(v));
+ let mut original = ByteArray::new();
+ original.set_data(Bytes::from(o.as_bytes().to_vec()));
+ assert!(greater > original);
+ } else {
+ panic!("Expected incremented UTF8 string to also be valid.");
+ }
+ };
+
// Basic ASCII case
- let v = increment_utf8("hello".as_bytes().to_vec()).unwrap();
- assert_eq!(&v, "hellp".as_bytes());
+ test_inc("hello", "hellp");
+
+ // 1-byte ending in max 1-byte
+ test_inc("a\u{7f}", "b");
- // Also show that BinaryArray level comparison works here
- let mut greater = ByteArray::new();
- greater.set_data(Bytes::from(v));
- let mut original = ByteArray::new();
- original.set_data(Bytes::from("hello".as_bytes().to_vec()));
- assert!(greater > original);
+ // 1-byte max should not truncate as it would need 2-byte code points
+ assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none());
// UTF8 string
- let s = "❤️🧡💛💚💙💜";
- let v = increment_utf8(s.as_bytes().to_vec()).unwrap();
+ test_inc("❤️🧡💛💚💙💜", "❤️🧡💛💚💙💝");
- if let Ok(new) = String::from_utf8(v) {
- assert_ne!(&new, s);
- assert_eq!(new, "❤️🧡💛💚💙💝");
- assert!(new.as_bytes().last().unwrap() > s.as_bytes().last().unwrap());
- } else {
- panic!("Expected incremented UTF8 string to also be valid.")
- }
+ // 2-byte without overflow
+ test_inc("éééé", "éééê");
- // Max UTF8 character - should be a No-Op
- let s = char::MAX.to_string();
- assert_eq!(s.len(), 4);
- let v = increment_utf8(s.as_bytes().to_vec());
- assert!(v.is_none());
+ // 2-byte that overflows lowest byte
+ test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}");
+
+ // 2-byte ending in max 2-byte
+ test_inc("a\u{7ff}", "b");
+
+ // Max 2-byte should not truncate as it would need 3-byte code points
+ assert!(increment_utf8("\u{7ff}\u{7ff}".as_bytes().to_vec()).is_none());
+
+ // 3-byte without overflow
+ test_inc("ࠀࠀ", "ࠀࠁ");
Review Comment:
Maybe just a comment to help the next uneducated reader ๐
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]