alamb commented on code in PR #2278: URL: https://github.com/apache/arrow-rs/pull/2278#discussion_r935875334
########## parquet/src/util/bit_util.rs: ########## @@ -497,46 +487,104 @@ impl BitReader { } } - let in_buf = &self.buffer.data()[self.byte_offset..]; - let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32; - if size_of::<T>() == 4 { - while values_to_read - i >= 32 { - let out_ptr = &mut batch[i..] as *mut [T] as *mut T as *mut u32; - in_ptr = unsafe { unpack32(in_ptr, out_ptr, num_bits) }; - self.byte_offset += 4 * num_bits; - i += 32; + let in_buf = self.buffer.data(); + + // Read directly into output buffer + match size_of::<T>() { + 1 => { + let ptr = batch.as_mut_ptr() as *mut u8; + let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; + while values_to_read - i >= 8 { + let out_slice = (&mut out[i..i + 8]).try_into().unwrap(); + unpack8(&in_buf[self.byte_offset..], out_slice, num_bits); + self.byte_offset += num_bits; + i += 8; + } } - } else { - let mut out_buf = [0u32; 32]; - let out_ptr = &mut out_buf as &mut [u32] as *mut [u32] as *mut u32; - while values_to_read - i >= 32 { - in_ptr = unsafe { unpack32(in_ptr, out_ptr, num_bits) }; - self.byte_offset += 4 * num_bits; - - for out in out_buf { - // Zero-allocate buffer - let mut out_bytes = T::Buffer::default(); - let in_bytes = out.to_le_bytes(); - - { - let out_bytes = out_bytes.as_mut(); - let len = out_bytes.len().min(in_bytes.len()); - (&mut out_bytes[..len]).copy_from_slice(&in_bytes[..len]); - } - - batch[i] = T::from_le_bytes(out_bytes); - i += 1; + 2 => { + let ptr = batch.as_mut_ptr() as *mut u16; + let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; + while values_to_read - i >= 16 { + let out_slice = (&mut out[i..i + 16]).try_into().unwrap(); + unpack16(&in_buf[self.byte_offset..], out_slice, num_bits); + self.byte_offset += 2 * num_bits; + i += 16; } } + 4 => { + let ptr = batch.as_mut_ptr() as *mut u32; + let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; + while values_to_read - i >= 32 { + let out_slice = (&mut out[i..i + 32]).try_into().unwrap(); + unpack32(&in_buf[self.byte_offset..], out_slice, num_bits); + self.byte_offset += 4 * num_bits; + i += 32; + } + } + 8 => { + let ptr = batch.as_mut_ptr() as *mut u64; + let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; + while values_to_read - i >= 64 { + let out_slice = (&mut out[i..i + 64]).try_into().unwrap(); + unpack64(&in_buf[self.byte_offset..], out_slice, num_bits); + self.byte_offset += 8 * num_bits; + i += 64; + } + } + _ => unreachable!(), + } + + // Try to read smaller batches if possible Review Comment: ```suggestion // Try to read remainder, if any, in batches if possible ``` ########## parquet/src/util/bit_util.rs: ########## @@ -476,17 +477,6 @@ impl BitReader { let mut i = 0; Review Comment: it isn't clear to me what the `num_bits` input parameter means (is it the number of bits to read after `batch.len()` whole `T` values?) -- can you possibly update the comments too? ########## parquet/src/util/bit_pack.rs: ########## @@ -0,0 +1,231 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Vectorised bit-packing utilities + +macro_rules! unroll_impl { + (1, $offset:expr, $v:ident, $c:block) => {{ + const $v: usize = $offset; + $c + }}; + (2, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(1, $offset, $v, $c); + unroll_impl!(1, $offset + 1, $v, $c); + }}; + (4, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v4, { unroll_impl!(2, __v4 * 2 + $offset, $v, $c) }); + }}; + (8, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v8, { unroll_impl!(4, __v8 * 4 + $offset, $v, $c) }); + }}; + (16, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(4, 0, __v16, { + unroll_impl!(4, __v16 * 4 + $offset, $v, $c) + }); + }}; + (32, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v32, { + unroll_impl!(16, __v32 * 16 + $offset, $v, $c) + }); + }}; + (64, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v64, { + unroll_impl!(32, __v64 * 32 + $offset, $v, $c) + }); + }}; +} + +/// Manually unrolls a loop body, this is useful for two reasons +/// +/// - force unrolling of a loop so that LLVM optimises it properly +/// - call a const generic function with the loop value +macro_rules! unroll { + (for $v:ident in 0..$end:tt $c:block) => { + #[allow(non_upper_case_globals)] + { + unroll_impl!($end, 0, $v, $c) + } + }; +} + +/// Macro that generates an unpack function taking the number of bits as a const generic +macro_rules! unpack_impl { + ($t:ty, $bytes:literal, $bits:tt) => { + pub fn unpack<const NUM_BITS: usize>(input: &[u8], output: &mut [$t; $bits]) { + if NUM_BITS == 0 { + for out in output { + *out = 0; + } + return; + } + + assert!(NUM_BITS <= $bytes * 8); + + let mask = match NUM_BITS { + $bits => <$t>::MAX, + _ => ((1 << NUM_BITS) - 1), + }; + + assert!(input.len() >= NUM_BITS * $bytes); + + let r = |output_idx: usize| { + <$t>::from_le_bytes( + input[output_idx * $bytes..output_idx * $bytes + $bytes] + .try_into() + .unwrap(), + ) + }; + + unroll!(for i in 0..$bits { + let start_bit = i * NUM_BITS; + let end_bit = start_bit + NUM_BITS; + + let start_bit_offset = start_bit % $bits; + let end_bit_offset = end_bit % $bits; + let start_byte = start_bit / $bits; + let end_byte = end_bit / $bits; + if start_byte != end_byte && end_bit_offset != 0 { + let val = r(start_byte); + let a = val >> start_bit_offset; + let val = r(end_byte); + let b = val << (NUM_BITS - end_bit_offset); + + output[i] = a | (b & mask); + } else { + let val = r(start_byte); + output[i] = (val >> start_bit_offset) & mask; + } + }); + } + }; +} + +/// Macro that generates unpack functions that accept num_bits as a parameter +macro_rules! unpack { + ($name:ident, $t:ty, $bytes:literal, $bits:tt) => { + mod $name { + unpack_impl!($t, $bytes, $bits); + } + + /// Unpack packed `input` into `output` with a bit width of `num_bits` + pub fn $name(input: &[u8], output: &mut [$t; $bits], num_bits: usize) { + // This will get optimised into a jump table + unroll!(for i in 0..$bits { + if i == num_bits { + return $name::unpack::<i>(input, output); + } + }); + if num_bits == $bits { + return $name::unpack::<$bits>(input, output); + } + unreachable!("invalid num_bits {}", num_bits); + } + }; +} + +unpack!(unpack8, u8, 1, 8); +unpack!(unpack16, u16, 2, 16); +unpack!(unpack32, u32, 4, 32); +unpack!(unpack64, u64, 8, 64); + +#[cfg(test)] +mod tests { + use super::*; + use rand::{thread_rng, Rng}; + + #[inline(never)] Review Comment: Why this annotation? ########## parquet/src/util/bit_util.rs: ########## @@ -1014,11 +1062,12 @@ mod tests { fn test_get_batch() { const SIZE: &[usize] = &[1, 31, 32, 33, 128, 129]; for s in SIZE { - for i in 0..33 { + for i in 0..=64 { match i { 0..=8 => test_get_batch_helper::<u8>(*s, i), 9..=16 => test_get_batch_helper::<u16>(*s, i), - _ => test_get_batch_helper::<u32>(*s, i), + 17..=32 => test_get_batch_helper::<u32>(*s, i), Review Comment: 👍 ########## parquet/src/util/bit_pack.rs: ########## @@ -0,0 +1,231 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Vectorised bit-packing utilities + +macro_rules! unroll_impl { + (1, $offset:expr, $v:ident, $c:block) => {{ + const $v: usize = $offset; + $c + }}; + (2, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(1, $offset, $v, $c); + unroll_impl!(1, $offset + 1, $v, $c); + }}; + (4, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v4, { unroll_impl!(2, __v4 * 2 + $offset, $v, $c) }); + }}; + (8, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v8, { unroll_impl!(4, __v8 * 4 + $offset, $v, $c) }); + }}; + (16, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(4, 0, __v16, { + unroll_impl!(4, __v16 * 4 + $offset, $v, $c) + }); + }}; + (32, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v32, { + unroll_impl!(16, __v32 * 16 + $offset, $v, $c) + }); + }}; + (64, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v64, { + unroll_impl!(32, __v64 * 32 + $offset, $v, $c) + }); + }}; +} + +/// Manually unrolls a loop body, this is useful for two reasons +/// +/// - force unrolling of a loop so that LLVM optimises it properly +/// - call a const generic function with the loop value +macro_rules! unroll { Review Comment: An example might help to document this -- btw the use of matching patterns for unrolling `i in 0..end` is quite neat ########## parquet/src/util/bit_pack.rs: ########## @@ -0,0 +1,231 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Vectorised bit-packing utilities + +macro_rules! unroll_impl { + (1, $offset:expr, $v:ident, $c:block) => {{ + const $v: usize = $offset; + $c + }}; + (2, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(1, $offset, $v, $c); + unroll_impl!(1, $offset + 1, $v, $c); + }}; + (4, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v4, { unroll_impl!(2, __v4 * 2 + $offset, $v, $c) }); + }}; + (8, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v8, { unroll_impl!(4, __v8 * 4 + $offset, $v, $c) }); + }}; + (16, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(4, 0, __v16, { + unroll_impl!(4, __v16 * 4 + $offset, $v, $c) + }); + }}; + (32, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v32, { + unroll_impl!(16, __v32 * 16 + $offset, $v, $c) + }); + }}; + (64, $offset:expr, $v:ident, $c:block) => {{ + unroll_impl!(2, 0, __v64, { + unroll_impl!(32, __v64 * 32 + $offset, $v, $c) + }); + }}; +} + +/// Manually unrolls a loop body, this is useful for two reasons +/// +/// - force unrolling of a loop so that LLVM optimises it properly +/// - call a const generic function with the loop value +macro_rules! unroll { + (for $v:ident in 0..$end:tt $c:block) => { + #[allow(non_upper_case_globals)] + { + unroll_impl!($end, 0, $v, $c) + } + }; +} + +/// Macro that generates an unpack function taking the number of bits as a const generic +macro_rules! unpack_impl { + ($t:ty, $bytes:literal, $bits:tt) => { + pub fn unpack<const NUM_BITS: usize>(input: &[u8], output: &mut [$t; $bits]) { + if NUM_BITS == 0 { + for out in output { + *out = 0; + } + return; + } + + assert!(NUM_BITS <= $bytes * 8); + + let mask = match NUM_BITS { + $bits => <$t>::MAX, + _ => ((1 << NUM_BITS) - 1), + }; + + assert!(input.len() >= NUM_BITS * $bytes); + + let r = |output_idx: usize| { + <$t>::from_le_bytes( + input[output_idx * $bytes..output_idx * $bytes + $bytes] + .try_into() + .unwrap(), + ) + }; + + unroll!(for i in 0..$bits { + let start_bit = i * NUM_BITS; + let end_bit = start_bit + NUM_BITS; + + let start_bit_offset = start_bit % $bits; + let end_bit_offset = end_bit % $bits; + let start_byte = start_bit / $bits; + let end_byte = end_bit / $bits; + if start_byte != end_byte && end_bit_offset != 0 { + let val = r(start_byte); + let a = val >> start_bit_offset; + let val = r(end_byte); + let b = val << (NUM_BITS - end_bit_offset); + + output[i] = a | (b & mask); Review Comment: I wonder if we could somehow remove the `output[i]` bounds check too? Perhaps with an iterator or append or something 🤔 ########## parquet/src/util/bit_pack.rs: ########## @@ -0,0 +1,231 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Vectorised bit-packing utilities + +macro_rules! unroll_impl { Review Comment: Can we get some comments in this macro so that future readers can have a hope of understanding what it does without hours of study ;) Starting with inlining the comments from the PR is probably a good start :) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org