xinlifoobar commented on code in PR #6231:
URL: https://github.com/apache/arrow-rs/pull/6231#discussion_r1722724530
##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -667,6 +699,58 @@ impl StringViewArray {
None => true,
})
}
+
+ /// Returns an iterator over the prefix bytes of this array with respect
to the prefix length.
+ /// If the prefix length is larger than the string length, it will return
the empty string.
+ pub fn prefix_iter(&self, prefix_len: usize) -> impl Iterator<Item = &str>
{
+ self.views().into_iter().map(move |v| {
+ let len = (*v as u32) as usize;
+
+ if len < prefix_len {
+ return "";
+ }
+
+ let b = if prefix_len <= 4 || len <= 12 {
+ unsafe { StringViewArray::inline_value(v, prefix_len) }
+ } else {
+ let view = ByteView::from(*v);
+ let data = unsafe {
+ self.data_buffers()
+ .get_unchecked(view.buffer_index as usize)
+ };
+ let offset = view.offset as usize;
+ unsafe { data.get_unchecked(offset..offset + prefix_len) }
+ };
+
+ unsafe { str::from_utf8_unchecked(b) }
+ })
+ }
+
+ /// Returns an iterator over the suffix bytes of this array with respect
to the suffix length.
+ /// If the suffix length is larger than the string length, it will return
the empty string.
+ pub fn suffix_iter(&self, suffix_len: usize) -> impl Iterator<Item = &str>
{
+ self.views().into_iter().map(move |v| {
+ let len = (*v as u32) as usize;
+
+ if len < suffix_len {
+ return "";
+ }
+
+ let b = if len <= 12 {
+ unsafe { &StringViewArray::inline_value(v, len)[len -
suffix_len..] }
+ } else {
+ let view = ByteView::from(*v);
+ let data = unsafe {
+ self.data_buffers()
+ .get_unchecked(view.buffer_index as usize)
+ };
+ let offset = view.offset as usize;
+ unsafe { data.get_unchecked(offset + len - suffix_len..offset
+ len) }
Review Comment:
Changed this to return bytes to avoid the invalid UTF-8 issue: for prefix and
suffix, this PR now does byte-wise comparisons instead of string/char-wise ones.
I also added a test case for multi-byte Unicode characters. The `str.len()`
method returns the number of underlying bytes rather than the number of `chars`,
so we should be safe.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]