alamb commented on code in PR #6870:
URL: https://github.com/apache/arrow-rs/pull/6870#discussion_r1885862689
##########
parquet/src/column/writer/mod.rs:
##########
@@ -1418,13 +1438,51 @@ fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool {
(a[1..]) > (b[1..])
}
-/// Truncate a UTF8 slice to the longest prefix that is still a valid UTF8 string,
-/// while being less than `length` bytes and non-empty
+/// Truncate a UTF-8 slice to the longest prefix that is still a valid UTF-8 string,
+/// while being less than `length` bytes and non-empty. Returns `None` if truncation
+/// is not possible within those constraints.
+///
+/// The caller guarantees that data.len() > length.
fn truncate_utf8(data: &str, length: usize) -> Option<Vec<u8>> {
let split = (1..=length).rfind(|x| data.is_char_boundary(*x))?;
Some(data.as_bytes()[..split].to_vec())
}
+/// Truncate a UTF-8 slice and increment its final character. The returned value is the
+/// longest such slice that is still a valid UTF-8 string while being less than `length`
+/// bytes and non-empty. Returns `None` if no such transformation is possible.
+///
+/// The caller guarantees that data.len() > length.
+fn truncate_and_increment_utf8(data: &str, length: usize) -> Option<Vec<u8>> {
+ // UTF-8 is max 4 bytes, so start search 3 back from desired length
+ let lower_bound = length.saturating_sub(3);
+ let split = (lower_bound..=length).rfind(|x| data.is_char_boundary(*x))?;
+ increment_utf8(data.get(..split)?)
+}
+
+/// Increment the final character in a UTF-8 string in such a way that the returned result
+/// is still a valid UTF-8 string. The returned string may be shorter than the input if the
+/// last character(s) cannot be incremented (due to overflow or producing invalid code points).
+/// Returns `None` if the string cannot be incremented.
+///
+/// Note that this implementation will not promote an N-byte code point to (N+1) bytes.
+fn increment_utf8(data: &str) -> Option<Vec<u8>> {
+ for (idx, code_point) in data.char_indices().rev() {
+ let curr_len = code_point.len_utf8();
+ let original = code_point as u32;
+ if let Some(next_char) = char::from_u32(original + 1) {
+ // do not allow increasing byte width of incremented char
Review Comment:
I think what you have here (equality) is good and easy to reason about.
Let's keep it that way
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]