Re: [PR] Directly decode String/BinaryView types from arrow-row format [arrow-rs]

via GitHub Sat, 13 Jul 2024 03:40:40 -0700


alamb commented on code in PR #6044:
URL: https://github.com/apache/arrow-rs/pull/6044#discussion_r1676803062



##########
arrow-row/src/variable.rs:
##########
@@ -243,6 +244,88 @@ pub fn decode_binary<I: OffsetSizeTrait>(
     unsafe { GenericBinaryArray::from(builder.build_unchecked()) }
 }
 
+fn decode_binary_view_inner(
+    rows: &mut [&[u8]],
+    options: SortOptions,
+    check_utf8: bool,
+) -> BinaryViewArray {
+    let len = rows.len();
+
+    let mut null_count = 0;
+
+    let nulls = MutableBuffer::collect_bool(len, |x| {
+        let valid = rows[x][0] != null_sentinel(options);
+        null_count += !valid as usize;
+        valid
+    });
+
+    let values_capacity: usize = rows.iter().map(|row| decoded_len(row, 
options)).sum();
+    let mut values = MutableBuffer::new(values_capacity);

Review Comment:
   I think it would be valuable to add that context as a comment here as well 
   
   ```suggestion
       // Reserve the max amount of bytes possible for this buffer.
       // However, only strings longer than 12 bytes are copied here
       // This means it is less memory efficient but faster to do utf8 
validation 
       let mut values = MutableBuffer::new(values_capacity);
   ```



##########
arrow-row/src/variable.rs:
##########
@@ -243,6 +244,88 @@ pub fn decode_binary<I: OffsetSizeTrait>(
     unsafe { GenericBinaryArray::from(builder.build_unchecked()) }
 }
 
+fn decode_binary_view_inner(
+    rows: &mut [&[u8]],
+    options: SortOptions,
+    check_utf8: bool,
+) -> BinaryViewArray {
+    let len = rows.len();
+
+    let mut null_count = 0;
+
+    let nulls = MutableBuffer::collect_bool(len, |x| {
+        let valid = rows[x][0] != null_sentinel(options);
+        null_count += !valid as usize;
+        valid
+    });
+
+    let values_capacity: usize = rows.iter().map(|row| decoded_len(row, 
options)).sum();
+    let mut values = MutableBuffer::new(values_capacity);
+    let mut views = BufferBuilder::<u128>::new(len);
+
+    for row in rows {
+        let start_offset = values.len();
+        let offset = decode_blocks(row, options, |b| 
values.extend_from_slice(b));

Review Comment:
   I wonder if we could do something similar to the parquet decoder (decode the 
strings into 2 different buffers, one for utf8 short string validation that was 
thrown away after conversion).



##########
arrow-row/src/variable.rs:
##########
@@ -243,6 +244,88 @@ pub fn decode_binary<I: OffsetSizeTrait>(
     unsafe { GenericBinaryArray::from(builder.build_unchecked()) }
 }
 
+fn decode_binary_view_inner(
+    rows: &mut [&[u8]],
+    options: SortOptions,
+    check_utf8: bool,
+) -> BinaryViewArray {
+    let len = rows.len();
+
+    let mut null_count = 0;
+
+    let nulls = MutableBuffer::collect_bool(len, |x| {
+        let valid = rows[x][0] != null_sentinel(options);
+        null_count += !valid as usize;
+        valid
+    });
+
+    let values_capacity: usize = rows.iter().map(|row| decoded_len(row, 
options)).sum();
+    let mut values = MutableBuffer::new(values_capacity);
+    let mut views = BufferBuilder::<u128>::new(len);
+
+    for row in rows {
+        let start_offset = values.len();
+        let offset = decode_blocks(row, options, |b| 
values.extend_from_slice(b));
+        if row[0] == null_sentinel(options) {
+            debug_assert_eq!(offset, 1);
+            debug_assert_eq!(start_offset, values.len());
+            views.append(0);
+        } else {
+            let view = make_view(
+                unsafe { values.get_unchecked(start_offset..) },
+                0,
+                start_offset as u32,
+            );
+            views.append(view);
+        }
+        *row = &row[offset..];
+    }
+
+    if options.descending {
+        values.as_slice_mut().iter_mut().for_each(|o| *o = !*o);
+        for view in views.as_slice_mut() {
+            let len = *view as u32;
+            if len <= 12 {
+                let mut bytes = view.to_le_bytes();
+                bytes
+                    .iter_mut()
+                    .skip(4)
+                    .take(len as usize)
+                    .for_each(|o| *o = !*o);
+                *view = u128::from_le_bytes(bytes);
+            } else {
+                let mut byte_view = ByteView::from(*view);
+                let mut prefix = byte_view.prefix.to_le_bytes();
+                prefix.iter_mut().for_each(|o| *o = !*o);
+                byte_view.prefix = u32::from_le_bytes(prefix);
+                *view = byte_view.into();
+            }
+        }
+    }
+
+    if check_utf8 {
+        // the values contains all data, no matter if it is short or long
+        // we can validate utf8 in one go.
+        std::str::from_utf8(values.as_slice()).unwrap();

Review Comment:
   I looked at it seems like the `StringArray` code does not do a UTF8 check. 
If StringArray doesn't need to re-validate utf8 I don't think `StringViewArray` 
does either.
   
   The argument would be if the Rows came from a valid RowEncoder, any data 
that was encoded to `Rows` was valid Utf-8 the data that comes out must be too
   
   If there is some way to construct `Rows` / `Row` from raw bytes we probably 
could not make this assumption. However the APIs do not seem to allow this 
https://docs.rs/arrow-row/52.1.0/arrow_row/struct.Row.html



##########
arrow-row/src/variable.rs:
##########
@@ -243,6 +244,88 @@ pub fn decode_binary<I: OffsetSizeTrait>(
     unsafe { GenericBinaryArray::from(builder.build_unchecked()) }
 }
 
+fn decode_binary_view_inner(
+    rows: &mut [&[u8]],
+    options: SortOptions,
+    check_utf8: bool,
+) -> BinaryViewArray {
+    let len = rows.len();
+
+    let mut null_count = 0;
+
+    let nulls = MutableBuffer::collect_bool(len, |x| {
+        let valid = rows[x][0] != null_sentinel(options);
+        null_count += !valid as usize;
+        valid
+    });
+
+    let values_capacity: usize = rows.iter().map(|row| decoded_len(row, 
options)).sum();
+    let mut values = MutableBuffer::new(values_capacity);
+    let mut views = BufferBuilder::<u128>::new(len);
+
+    for row in rows {
+        let start_offset = values.len();
+        let offset = decode_blocks(row, options, |b| 
values.extend_from_slice(b));
+        if row[0] == null_sentinel(options) {
+            debug_assert_eq!(offset, 1);
+            debug_assert_eq!(start_offset, values.len());
+            views.append(0);
+        } else {
+            let view = make_view(
+                unsafe { values.get_unchecked(start_offset..) },

Review Comment:
   Maybe in the spirit of https://github.com/apache/arrow-rs/pull/6025 from 
@veluca93  we can justify this use of unsafe -- in this case I think it is that 
`decode_blocks` returns the correct length for the values it decoded 



##########
arrow-row/src/variable.rs:
##########
@@ -243,6 +244,88 @@ pub fn decode_binary<I: OffsetSizeTrait>(
     unsafe { GenericBinaryArray::from(builder.build_unchecked()) }
 }
 
+fn decode_binary_view_inner(
+    rows: &mut [&[u8]],
+    options: SortOptions,
+    check_utf8: bool,
+) -> BinaryViewArray {
+    let len = rows.len();
+
+    let mut null_count = 0;
+
+    let nulls = MutableBuffer::collect_bool(len, |x| {
+        let valid = rows[x][0] != null_sentinel(options);
+        null_count += !valid as usize;
+        valid
+    });
+
+    let values_capacity: usize = rows.iter().map(|row| decoded_len(row, 
options)).sum();
+    let mut values = MutableBuffer::new(values_capacity);
+    let mut views = BufferBuilder::<u128>::new(len);
+
+    for row in rows {
+        let start_offset = values.len();
+        let offset = decode_blocks(row, options, |b| 
values.extend_from_slice(b));
+        if row[0] == null_sentinel(options) {
+            debug_assert_eq!(offset, 1);
+            debug_assert_eq!(start_offset, values.len());
+            views.append(0);
+        } else {
+            let view = make_view(
+                unsafe { values.get_unchecked(start_offset..) },
+                0,
+                start_offset as u32,
+            );
+            views.append(view);
+        }
+        *row = &row[offset..];
+    }
+
+    if options.descending {
+        values.as_slice_mut().iter_mut().for_each(|o| *o = !*o);
+        for view in views.as_slice_mut() {

Review Comment:
   It took me a while to realize this code was flipping the bits on the inlined 
portion of the view data.
   
   I think at least we should add a comment to this effect. 
   
   Also I wonder if it would be faster (and easier to understand) if the bit 
twiddling was done while the views were constructed (so they didn't have to 
modify the same bytes in the inlined portion and the values buffer portion)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Directly decode String/BinaryView types from arrow-row format [arrow-rs]

Reply via email to