andygrove commented on code in PR #558: URL: https://github.com/apache/datafusion-comet/pull/558#discussion_r1638249606
########## core/src/parquet/read/values.rs: ########## @@ -438,41 +419,92 @@ impl PlainDictDecoding for BoolType { } } -// Shared implementation for int variants such as Int8 and Int16 -macro_rules! make_int_variant_impl { - ($ty: ident, $native_ty: ty, $type_size: expr) => { - impl PlainDecoding for $ty { +macro_rules! impl_plain_decoding_int { + ($dst_type:ty, $copy_fn:ident, $type_width:expr) => { + impl PlainDecoding for $dst_type { fn decode(src: &mut PlainDecoderInner, dst: &mut ParquetMutableVector, num: usize) { - let src_data = &src.data; let dst_slice = dst.value_buffer.as_slice_mut(); - let mut dst_offset = dst.num_values * $type_size; - for _ in 0..num { - unsafe { - copy_nonoverlapping( - &src_data[src.offset..] as *const [u8] as *const u8 - as *const $native_ty, - &mut dst_slice[dst_offset] as *mut u8 as *mut $native_ty, - 1, - ); - } - src.offset += 4; // Parquet stores Int8/Int16 using 4 bytes - dst_offset += $type_size; - } + let dst_offset = dst.num_values * $type_width; + $copy_fn(&src.data[src.offset..], &mut dst_slice[dst_offset..], num); + src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes } fn skip(src: &mut PlainDecoderInner, num: usize) { - let num_bytes = 4 * num; // Parquet stores Int8/Int16 using 4 bytes - src.offset += num_bytes; + src.offset += 4 * num; // Parquet stores Int8/Int16 using 4 bytes + } + } + }; +} + +impl_plain_decoding_int!(Int8Type, copy_i32_to_i8, 1); +impl_plain_decoding_int!(Int16Type, copy_i32_to_i16, 2); +impl_plain_decoding_int!(Int32To64Type, copy_i32_to_i64, 4); + +// unsigned type require double the width and zeroes are written for the second half +// perhaps because they are implemented as the next size up signed type? +impl_plain_decoding_int!(UInt8Type, copy_i32_to_u8, 2); Review Comment: This confused me as well, but I am using the same type widths as the original code: ``` make_int_variant_impl!(Int8Type, i8, 1); make_int_variant_impl!(UInt8Type, u8, 2); make_int_variant_impl!(Int16Type, i16, 2); make_int_variant_impl!(UInt16Type, u16, 4); make_int_variant_impl!(UInt32Type, u32, 8); ``` This seems to be specific to the way Parquet represents unsigned types. I think Parquet only supports signed types so u8 becomes i16, u16 becomes i32, and so on -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org