Re: [PR] fix: Re-implement some Parquet decode methods without `copy_nonoverlapping` [datafusion-comet]

via GitHub Thu, 13 Jun 2024 06:45:46 -0700


andygrove commented on code in PR #558:
URL: https://github.com/apache/datafusion-comet/pull/558#discussion_r1638249606



##########
core/src/parquet/read/values.rs:
##########
@@ -438,41 +419,92 @@ impl PlainDictDecoding for BoolType {
     }
 }
 
-// Shared implementation for int variants such as Int8 and Int16
-macro_rules! make_int_variant_impl {
-    ($ty: ident, $native_ty: ty, $type_size: expr) => {
-        impl PlainDecoding for $ty {
+macro_rules! impl_plain_decoding_int {
+    ($dst_type:ty, $copy_fn:ident, $type_width:expr) => {
+        impl PlainDecoding for $dst_type {
             fn decode(src: &mut PlainDecoderInner, dst: &mut 
ParquetMutableVector, num: usize) {
-                let src_data = &src.data;
                 let dst_slice = dst.value_buffer.as_slice_mut();
-                let mut dst_offset = dst.num_values * $type_size;
-                for _ in 0..num {
-                    unsafe {
-                        copy_nonoverlapping(
-                            &src_data[src.offset..] as *const [u8] as *const u8
-                                as *const $native_ty,
-                            &mut dst_slice[dst_offset] as *mut u8 as *mut 
$native_ty,
-                            1,
-                        );
-                    }
-                    src.offset += 4; // Parquet stores Int8/Int16 using 4 bytes
-                    dst_offset += $type_size;
-                }
+                let dst_offset = dst.num_values * $type_width;
+                $copy_fn(&src.data[src.offset..], &mut 
dst_slice[dst_offset..], num);
+                src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 
bytes
             }
 
             fn skip(src: &mut PlainDecoderInner, num: usize) {
-                let num_bytes = 4 * num; // Parquet stores Int8/Int16 using 4 
bytes
-                src.offset += num_bytes;
+                src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 
bytes
+            }
+        }
+    };
+}
+
+impl_plain_decoding_int!(Int8Type, copy_i32_to_i8, 1);
+impl_plain_decoding_int!(Int16Type, copy_i32_to_i16, 2);
+impl_plain_decoding_int!(Int32To64Type, copy_i32_to_i64, 4);
+
+// unsigned type require double the width and zeroes are written for the 
second half
+// perhaps because they are implemented as the next size up signed type?
+impl_plain_decoding_int!(UInt8Type, copy_i32_to_u8, 2);

Review Comment:
   This confused me as well, but I am using the same type widths as the 
original code:
   
   ```
   make_int_variant_impl!(Int8Type, i8, 1);
   make_int_variant_impl!(UInt8Type, u8, 2);
   make_int_variant_impl!(Int16Type, i16, 2);
   make_int_variant_impl!(UInt16Type, u16, 4);
   make_int_variant_impl!(UInt32Type, u32, 8);
   ```
   
   This seems to be specific to the way Parquet represents unsigned types. I 
think Parquet only supports signed types so u8 becomes i16, u16 becomes i32, 
and so on



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] fix: Re-implement some Parquet decode methods without `copy_nonoverlapping` [datafusion-comet]

Reply via email to