AdamGS commented on code in PR #4389:
URL: https://github.com/apache/arrow-rs/pull/4389#discussion_r1225238086


##########
parquet/src/column/writer/mod.rs:
##########
@@ -1152,6 +1181,56 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: 
&[u8]) -> bool {
     (a[1..]) > (b[1..])
 }
 
+/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 
string, while being less than `max_len` bytes.
+fn truncate_utf8(data: &str, max_len: usize) -> Vec<u8> {
+    let mut max_possible_len = usize::min(data.len(), max_len);
+
+    if data.is_char_boundary(max_possible_len) {
+        return data.as_bytes()[0..max_possible_len].to_vec();
+    }
+
+    // UTF8 characters can only be up to 4 bytes long, so this loop has will 
only run up to 3 times before returning.
+    loop {
+        max_possible_len -= 1;
+        if data.is_char_boundary(max_possible_len) {
+            return data.as_bytes()[0..max_possible_len].to_vec();
+        }
+    }
+}
+
+/// Truncate a binary slice to make sure its length is less than `max_len`
+fn truncate_binary(data: &[u8], max_len: usize) -> Vec<u8> {
+    data[0..usize::min(data.len(), max_len)].to_vec()
+}
+
+/// Try and increment the bytes from right to left.
+fn increment(data: &mut [u8]) {
+    for byte in data.iter_mut().rev() {
+        if *byte == u8::MAX {
+            continue;
+        } else {
+            *byte += 1;
+            break;
+        }
+    }
+}
+
+/// Try and increment the the string's bytes from right to left, returning 
when the result is a valid UTF8 string.
+fn increment_utf8(data: &mut Vec<u8>) {
+    for idx in (0..data.len()).rev() {
+        let byte = &mut data[idx];

Review Comment:
   Found at least one case where this approach doesn't seem to yield a valid 
UTF8 string (`"a\u{10ffff}"`, which is `"a"` followed by a `char::MAX`). I'll 
try an implementation that is closer to the `parquet-mr` one.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to