alamb commented on code in PR #6870:
URL: https://github.com/apache/arrow-rs/pull/6870#discussion_r1882098814
##########
parquet/src/column/writer/mod.rs:
##########
@@ -3140,39 +3157,67 @@ mod tests {
#[test]
fn test_increment_utf8() {
+ let test_inc = |o: &str, expected: &str| {
+ if let Ok(v) =
String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) {
+ // Got the expected result...
+ assert_eq!(v, expected);
+ // and it's greater than the original string
+ assert!(*v > *o);
+ // Also show that BinaryArray level comparison works here
+ let mut greater = ByteArray::new();
+ greater.set_data(Bytes::from(v));
+ let mut original = ByteArray::new();
+ original.set_data(Bytes::from(o.as_bytes().to_vec()));
+ assert!(greater > original);
+ } else {
+ panic!("Expected incremented UTF8 string to also be valid.");
+ }
+ };
+
// Basic ASCII case
- let v = increment_utf8("hello".as_bytes().to_vec()).unwrap();
- assert_eq!(&v, "hellp".as_bytes());
+ test_inc("hello", "hellp");
+
+ // 1-byte ending in max 1-byte
+ test_inc("a\u{7f}", "b");
- // Also show that BinaryArray level comparison works here
- let mut greater = ByteArray::new();
- greater.set_data(Bytes::from(v));
- let mut original = ByteArray::new();
- original.set_data(Bytes::from("hello".as_bytes().to_vec()));
- assert!(greater > original);
+ // 1-byte max should not truncate as it would need 2-byte code points
+ assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none());
// UTF8 string
- let s = "โค๏ธ๐งก๐๐๐๐";
- let v = increment_utf8(s.as_bytes().to_vec()).unwrap();
+ test_inc("โค๏ธ๐งก๐๐๐๐", "โค๏ธ๐งก๐๐๐๐");
- if let Ok(new) = String::from_utf8(v) {
- assert_ne!(&new, s);
- assert_eq!(new, "โค๏ธ๐งก๐๐๐๐");
- assert!(new.as_bytes().last().unwrap() >
s.as_bytes().last().unwrap());
- } else {
- panic!("Expected incremented UTF8 string to also be valid.")
- }
+ // 2-byte without overflow
+ test_inc("รฉรฉรฉรฉ", "รฉรฉรฉรช");
- // Max UTF8 character - should be a No-Op
- let s = char::MAX.to_string();
- assert_eq!(s.len(), 4);
- let v = increment_utf8(s.as_bytes().to_vec());
- assert!(v.is_none());
+ // 2-byte that overflows lowest byte
+ test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}");
+
+ // 2-byte ending in max 2-byte
+ test_inc("a\u{7ff}", "b");
+
+ // Max 2-byte should not truncate as it would need 3-byte code points
+
assert!(increment_utf8("\u{7ff}\u{7ff}".as_bytes().to_vec()).is_none());
+
+ // 3-byte without overflow
+ test_inc("เ เ ", "เ เ ");
Review Comment:
This might be related to how this renders, but it seems strange to me that
the first character is incremented ratehr than the last
In other words, I would have expected it like
```suggestion
test_inc("เ เ ", "เ เ ");
```
If I am reading the internet right ๐ค
https://unicodes.jessetane.com/%E0%A0%82
https://unicodes.jessetane.com/%E0%A0%80
##########
parquet/src/column/writer/mod.rs:
##########
@@ -3140,39 +3157,67 @@ mod tests {
#[test]
fn test_increment_utf8() {
+ let test_inc = |o: &str, expected: &str| {
+ if let Ok(v) =
String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) {
+ // Got the expected result...
+ assert_eq!(v, expected);
+ // and it's greater than the original string
+ assert!(*v > *o);
+ // Also show that BinaryArray level comparison works here
+ let mut greater = ByteArray::new();
+ greater.set_data(Bytes::from(v));
+ let mut original = ByteArray::new();
+ original.set_data(Bytes::from(o.as_bytes().to_vec()));
+ assert!(greater > original);
Review Comment:
100% for verifying this property
##########
parquet/src/column/writer/mod.rs:
##########
@@ -1444,15 +1444,31 @@ fn increment(mut data: Vec<u8>) -> Option<Vec<u8>> {
/// Try and increment the the string's bytes from right to left, returning
when the result
/// is a valid UTF8 string. Returns `None` when it can't increment any byte.
fn increment_utf8(mut data: Vec<u8>) -> Option<Vec<u8>> {
+ const UTF8_CONTINUATION: u8 = 0x80;
+ const UTF8_CONTINUATION_MASK: u8 = 0xc0;
+
+ let mut len = data.len();
for idx in (0..data.len()).rev() {
let original = data[idx];
let (byte, overflow) = original.overflowing_add(1);
if !overflow {
data[idx] = byte;
if str::from_utf8(&data).is_ok() {
Review Comment:
we can probably make this much faster by not revalidating the entire string
(we only have to look back and find where the utf8 boundary starts
##########
parquet/src/column/writer/mod.rs:
##########
@@ -3140,39 +3157,67 @@ mod tests {
#[test]
fn test_increment_utf8() {
+ let test_inc = |o: &str, expected: &str| {
+ if let Ok(v) =
String::from_utf8(increment_utf8(o.as_bytes().to_vec()).unwrap()) {
+ // Got the expected result...
+ assert_eq!(v, expected);
+ // and it's greater than the original string
+ assert!(*v > *o);
+ // Also show that BinaryArray level comparison works here
+ let mut greater = ByteArray::new();
+ greater.set_data(Bytes::from(v));
+ let mut original = ByteArray::new();
+ original.set_data(Bytes::from(o.as_bytes().to_vec()));
+ assert!(greater > original);
+ } else {
+ panic!("Expected incremented UTF8 string to also be valid.");
+ }
+ };
+
// Basic ASCII case
- let v = increment_utf8("hello".as_bytes().to_vec()).unwrap();
- assert_eq!(&v, "hellp".as_bytes());
+ test_inc("hello", "hellp");
+
+ // 1-byte ending in max 1-byte
+ test_inc("a\u{7f}", "b");
- // Also show that BinaryArray level comparison works here
- let mut greater = ByteArray::new();
- greater.set_data(Bytes::from(v));
- let mut original = ByteArray::new();
- original.set_data(Bytes::from("hello".as_bytes().to_vec()));
- assert!(greater > original);
+ // 1-byte max should not truncate as it would need 2-byte code points
+ assert!(increment_utf8("\u{7f}\u{7f}".as_bytes().to_vec()).is_none());
// UTF8 string
- let s = "โค๏ธ๐งก๐๐๐๐";
- let v = increment_utf8(s.as_bytes().to_vec()).unwrap();
+ test_inc("โค๏ธ๐งก๐๐๐๐", "โค๏ธ๐งก๐๐๐๐");
- if let Ok(new) = String::from_utf8(v) {
- assert_ne!(&new, s);
- assert_eq!(new, "โค๏ธ๐งก๐๐๐๐");
- assert!(new.as_bytes().last().unwrap() >
s.as_bytes().last().unwrap());
- } else {
- panic!("Expected incremented UTF8 string to also be valid.")
- }
+ // 2-byte without overflow
+ test_inc("รฉรฉรฉรฉ", "รฉรฉรฉรช");
- // Max UTF8 character - should be a No-Op
- let s = char::MAX.to_string();
- assert_eq!(s.len(), 4);
- let v = increment_utf8(s.as_bytes().to_vec());
- assert!(v.is_none());
+ // 2-byte that overflows lowest byte
+ test_inc("\u{ff}\u{ff}", "\u{ff}\u{100}");
Review Comment:
I double checked \u the unicode escape syntax:
https://doc.rust-lang.org/reference/tokens.html#unicode-escapes โ
##########
parquet/src/column/writer/mod.rs:
##########
@@ -1444,15 +1444,31 @@ fn increment(mut data: Vec<u8>) -> Option<Vec<u8>> {
/// Try and increment the the string's bytes from right to left, returning
when the result
/// is a valid UTF8 string. Returns `None` when it can't increment any byte.
fn increment_utf8(mut data: Vec<u8>) -> Option<Vec<u8>> {
+ const UTF8_CONTINUATION: u8 = 0x80;
+ const UTF8_CONTINUATION_MASK: u8 = 0xc0;
+
+ let mut len = data.len();
for idx in (0..data.len()).rev() {
let original = data[idx];
let (byte, overflow) = original.overflowing_add(1);
if !overflow {
data[idx] = byte;
if str::from_utf8(&data).is_ok() {
+ if len != data.len() {
+ data.truncate(len);
+ }
return Some(data);
}
- data[idx] = original;
+ // Incrementing "original" did not yield a valid unicode
character, so it overflowed
+ // its available bits. If it was a continuation byte (b10xxxxxx)
then set to min
+ // continuation (b10000000). Otherwise it was the first byte so
set reset the first
+ // byte back to its original value (so data remains a valid
string) and reduce "len".
+ if original & UTF8_CONTINUATION_MASK == UTF8_CONTINUATION {
Review Comment:
Trying to increment the last byte seems strange to me (as the byte my be in
the middle of a utf8 multi-byte sequence. I don't fully understand the
implications of setting the last byte to utf8 continuation token. Is that even
valid UTF8 ๐ค
I wonder if we could do the increment operation on the actual unicode value
rather than one of the bytes in the utf8 sequence? So something like
- Find the start of the last utf8 character (e.g search from the right for
the first byte that doesn't have its high bit set)
- Convert that 1-4 byte sequence into a `char` (unicode)
- increment the `char`
- Convert back to utf-8
- If the converted bytes don't fit at the end of the array, do something
else (maybe try the next previous character)
That would also save a call to `from_utf8` so it would also likely be faster
too
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]