This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 8a092e38b Add unpack8, unpack16, unpack64 (#2276) ~10-50% faster (#2278)
8a092e38b is described below

commit 8a092e38b82dd7ef24c60700d002a6a421b66802
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Aug 3 21:45:34 2022 +0100

    Add unpack8, unpack16, unpack64 (#2276) ~10-50% faster (#2278)
    
    * Add unpack8, unpack16, unpack64 (#2276)
    
    * Add zero-extend fallback
    
    * Fix copy-elision
    
    * Switch to using seq_macro
    
    * Remove unused function
    
    * Update docs
---
 parquet/Cargo.toml              |    1 +
 parquet/src/util/bit_pack.rs    |  138 ++
 parquet/src/util/bit_packing.rs | 3662 ---------------------------------------
 parquet/src/util/bit_util.rs    |  162 +-
 parquet/src/util/mod.rs         |    2 +-
 5 files changed, 250 insertions(+), 3715 deletions(-)

diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 671f232cf..0324ecccc 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -47,6 +47,7 @@ arrow = { path = "../arrow", version = "19.0.0", optional = true, default-featur
 base64 = { version = "0.13", default-features = false, features = ["std"], optional = true }
 clap = { version = "3", default-features = false, features = ["std", "derive", "env"], optional = true }
 serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true }
+seq-macro = { version = "0.3", default-features = false }
 rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
 futures = { version = "0.3", default-features = false, features = ["std"], optional = true }
 tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "fs", "rt", "io-util"] }
diff --git a/parquet/src/util/bit_pack.rs b/parquet/src/util/bit_pack.rs
new file mode 100644
index 000000000..b268aa567
--- /dev/null
+++ b/parquet/src/util/bit_pack.rs
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Vectorised bit-packing utilities
+
+/// Macro that generates an unpack function taking the number of bits as a const generic
+macro_rules! unpack_impl {
+    ($t:ty, $bytes:literal, $bits:tt) => {
+        pub fn unpack<const NUM_BITS: usize>(input: &[u8], output: &mut [$t; $bits]) {
+            if NUM_BITS == 0 {
+                for out in output {
+                    *out = 0;
+                }
+                return;
+            }
+
+            assert!(NUM_BITS <= $bytes * 8);
+
+            let mask = match NUM_BITS {
+                $bits => <$t>::MAX,
+                _ => ((1 << NUM_BITS) - 1),
+            };
+
+            assert!(input.len() >= NUM_BITS * $bytes);
+
+            let r = |output_idx: usize| {
+                <$t>::from_le_bytes(
+                    input[output_idx * $bytes..output_idx * $bytes + $bytes]
+                        .try_into()
+                        .unwrap(),
+                )
+            };
+
+            seq_macro::seq!(i in 0..$bits {
+                let start_bit = i * NUM_BITS;
+                let end_bit = start_bit + NUM_BITS;
+
+                let start_bit_offset = start_bit % $bits;
+                let end_bit_offset = end_bit % $bits;
+                let start_byte = start_bit / $bits;
+                let end_byte = end_bit / $bits;
+                if start_byte != end_byte && end_bit_offset != 0 {
+                    let val = r(start_byte);
+                    let a = val >> start_bit_offset;
+                    let val = r(end_byte);
+                    let b = val << (NUM_BITS - end_bit_offset);
+
+                    output[i] = a | (b & mask);
+                } else {
+                    let val = r(start_byte);
+                    output[i] = (val >> start_bit_offset) & mask;
+                }
+            });
+        }
+    };
+}
+
+/// Macro that generates unpack functions that accept num_bits as a parameter
+macro_rules! unpack {
+    ($name:ident, $t:ty, $bytes:literal, $bits:tt) => {
+        mod $name {
+            unpack_impl!($t, $bytes, $bits);
+        }
+
+        /// Unpack packed `input` into `output` with a bit width of `num_bits`
+        pub fn $name(input: &[u8], output: &mut [$t; $bits], num_bits: usize) {
+            // This will get optimised into a jump table
+            seq_macro::seq!(i in 0..=$bits {
+                if i == num_bits {
+                    return $name::unpack::<i>(input, output);
+                }
+            });
+            unreachable!("invalid num_bits {}", num_bits);
+        }
+    };
+}
+
+unpack!(unpack8, u8, 1, 8);
+unpack!(unpack16, u16, 2, 16);
+unpack!(unpack32, u32, 4, 32);
+unpack!(unpack64, u64, 8, 64);
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::{thread_rng, Rng};
+
+    #[test]
+    fn test_basic() {
+        let input = [0xFF; 4096];
+
+        for i in 0..=8 {
+            let mut output = [0; 8];
+            unpack8(&input, &mut output, i);
+            for (idx, out) in output.iter().enumerate() {
+                assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
+            }
+        }
+
+        for i in 0..=16 {
+            let mut output = [0; 16];
+            unpack16(&input, &mut output, i);
+            for (idx, out) in output.iter().enumerate() {
+                assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
+            }
+        }
+
+        for i in 0..=32 {
+            let mut output = [0; 32];
+            unpack32(&input, &mut output, i);
+            for (idx, out) in output.iter().enumerate() {
+                assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
+            }
+        }
+
+        for i in 0..=64 {
+            let mut output = [0; 64];
+            unpack64(&input, &mut output, i);
+            for (idx, out) in output.iter().enumerate() {
+                assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
+            }
+        }
+    }
+}
diff --git a/parquet/src/util/bit_packing.rs b/parquet/src/util/bit_packing.rs
deleted file mode 100644
index 758992ab2..000000000
--- a/parquet/src/util/bit_packing.rs
+++ /dev/null
@@ -1,3662 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-/// Unpack 32 values with bit width `num_bits` from `in_ptr`, and write to `out_ptr`.
-/// Return the `in_ptr` where the starting offset points to the first byte after all the
-/// bytes that were consumed.
-// TODO: may be better to make these more compact using if-else conditions.
-//  However, this may require const generics:
-//     https://github.com/rust-lang/rust/issues/44580
-//  to eliminate the branching cost.
-// TODO: we should use SIMD instructions to further optimize this. I have explored
-//    https://github.com/tantivy-search/bitpacking
-// but the layout it uses for SIMD is different from Parquet.
-// TODO: support packing as well, which is used for encoding.
-pub unsafe fn unpack32(
-    mut in_ptr: *const u32,
-    out_ptr: *mut u32,
-    num_bits: usize,
-) -> *const u32 {
-    in_ptr = match num_bits {
-        0 => nullunpacker32(in_ptr, out_ptr),
-        1 => unpack1_32(in_ptr, out_ptr),
-        2 => unpack2_32(in_ptr, out_ptr),
-        3 => unpack3_32(in_ptr, out_ptr),
-        4 => unpack4_32(in_ptr, out_ptr),
-        5 => unpack5_32(in_ptr, out_ptr),
-        6 => unpack6_32(in_ptr, out_ptr),
-        7 => unpack7_32(in_ptr, out_ptr),
-        8 => unpack8_32(in_ptr, out_ptr),
-        9 => unpack9_32(in_ptr, out_ptr),
-        10 => unpack10_32(in_ptr, out_ptr),
-        11 => unpack11_32(in_ptr, out_ptr),
-        12 => unpack12_32(in_ptr, out_ptr),
-        13 => unpack13_32(in_ptr, out_ptr),
-        14 => unpack14_32(in_ptr, out_ptr),
-        15 => unpack15_32(in_ptr, out_ptr),
-        16 => unpack16_32(in_ptr, out_ptr),
-        17 => unpack17_32(in_ptr, out_ptr),
-        18 => unpack18_32(in_ptr, out_ptr),
-        19 => unpack19_32(in_ptr, out_ptr),
-        20 => unpack20_32(in_ptr, out_ptr),
-        21 => unpack21_32(in_ptr, out_ptr),
-        22 => unpack22_32(in_ptr, out_ptr),
-        23 => unpack23_32(in_ptr, out_ptr),
-        24 => unpack24_32(in_ptr, out_ptr),
-        25 => unpack25_32(in_ptr, out_ptr),
-        26 => unpack26_32(in_ptr, out_ptr),
-        27 => unpack27_32(in_ptr, out_ptr),
-        28 => unpack28_32(in_ptr, out_ptr),
-        29 => unpack29_32(in_ptr, out_ptr),
-        30 => unpack30_32(in_ptr, out_ptr),
-        31 => unpack31_32(in_ptr, out_ptr),
-        32 => unpack32_32(in_ptr, out_ptr),
-        _ => unimplemented!(),
-    };
-    in_ptr
-}
-
-unsafe fn nullunpacker32(in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    for _ in 0..32 {
-        *out = 0;
-        out = out.offset(1);
-    }
-    in_buf
-}
-
-unsafe fn unpack1_32(in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 1) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 2) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 3) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 4) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 5) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 6) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 7) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 9) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 11) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 13) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 15) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 17) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 19) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 21) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 22) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 23) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 25) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 26) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 27) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 28) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 29) & 1;
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 30) & 1;
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack2_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 26) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 2);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-    *out = (in_buf.read_unaligned()) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 26) % (1u32 << 2);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 2);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack3_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 21) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 27) % (1u32 << 3);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (3 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 19) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 25) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 3);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (3 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 23) % (1u32 << 3);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 26) % (1u32 << 3);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack4_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 4);
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 4);
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 4);
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 4);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 4);
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack5_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 25) % (1u32 << 5);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (5 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 23) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 5);
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (5 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 21) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 26) % (1u32 << 5);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (5 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 19) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 5);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (5 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 5);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 5);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack6_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 6);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (6 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 6);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (6 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 6);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 6);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (6 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 6);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (6 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 6);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 6);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack7_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 21) % (1u32 << 7);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (7 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 7);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (7 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 7);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (7 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 23) % (1u32 << 7);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (7 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 19) % (1u32 << 7);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (7 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 7);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (7 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 7);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 7);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack8_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack9_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 9);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (9 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 9);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (9 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 9);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (9 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 21) % (1u32 << 9);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (9 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 9);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (9 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 9);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (9 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 9);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (9 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 19) % (1u32 << 9);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (9 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 9);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 9);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack10_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 10);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (10 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 10);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (10 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 10);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (10 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 10);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (10 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 10);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 10);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (10 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 10);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (10 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 10);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (10 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 10);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (10 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 10);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 10);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack11_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 11);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (11 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 11);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (11 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 11);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (11 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 11);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (11 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 11);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (11 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 11);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (11 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 11);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (11 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 11);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (11 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 11);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 19) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (11 - 9);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 11);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (11 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 11);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 21;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack12_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 12);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (12 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 12);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (12 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 12);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (12 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 12);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (12 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 12);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (12 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 12);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (12 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 12);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (12 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 12);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (12 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 12);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack13_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 13);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (13 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (13 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 13);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (13 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (13 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 13);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (13 - 9);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (13 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 13);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (13 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (13 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 13);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (13 - 11);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (13 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 13);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (13 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (13 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 13);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 19;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack14_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 14);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (14 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (14 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (14 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 14);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (14 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (14 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (14 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 14);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (14 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (14 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (14 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 14);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (14 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (14 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (14 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack15_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 15);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (15 - 13);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (15 - 11);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (15 - 9);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (15 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (15 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (15 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (15 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 15);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (15 - 14);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (15 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (15 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (15 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (15 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (15 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (15 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 17;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack16_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack17_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (17 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (17 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (17 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (17 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (17 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (17 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (17 - 14);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (17 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (17 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (17 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (17 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (17 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (17 - 9);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (17 - 11);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (17 - 13);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (17 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack18_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (18 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (18 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (18 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (18 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (18 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (18 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (18 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (18 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (18 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (18 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (18 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (18 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (18 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (18 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (18 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (18 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack19_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (19 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (19 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (19 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (19 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (19 - 11);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (19 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (19 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (19 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (19 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (19 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (19 - 9);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (19 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (19 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (19 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (19 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (19 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (19 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (19 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack20_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (20 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (20 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (20 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (20 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (20 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (20 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (20 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (20 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (20 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (20 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (20 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (20 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (20 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (20 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (20 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (20 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack21_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (21 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (21 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (21 - 9);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (21 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (21 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (21 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (21 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (21 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (21 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (21 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (21 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (21 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (21 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (21 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (21 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (21 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (21 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (21 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (21 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (21 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack22_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (22 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (22 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (22 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (22 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (22 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (22 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (22 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (22 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (22 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (22 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (22 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (22 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (22 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (22 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (22 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (22 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (22 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (22 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (22 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (22 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack23_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (23 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (23 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (23 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (23 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (23 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (23 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (23 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (23 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (23 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (23 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (23 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (23 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 21)) << (23 - 21);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (23 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (23 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (23 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (23 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (23 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (23 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (23 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (23 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (23 - 9);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 9;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack24_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack25_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 
{
-    *out = (in_buf.read_unaligned()) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (25 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (25 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (25 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (25 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (25 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (25 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (25 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (25 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (25 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (25 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 23)) << (25 - 23);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (25 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (25 - 9);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 9;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (25 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (25 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (25 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (25 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (25 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (25 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (25 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (25 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 21)) << (25 - 21);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (25 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (25 - 7);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 7;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack26_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 
{
-    *out = (in_buf.read_unaligned()) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (26 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (26 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (26 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (26 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (26 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (26 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (26 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (26 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (26 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (26 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (26 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (26 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (26 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (26 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (26 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (26 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (26 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (26 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (26 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (26 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (26 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (26 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (26 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (26 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack27_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 
{
-    *out = (in_buf.read_unaligned()) % (1u32 << 27);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (27 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (27 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (27 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (27 - 7);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 7;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (27 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 27);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (27 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (27 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (27 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (27 - 9);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 9;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (27 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 27);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 26)) << (27 - 26);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 21)) << (27 - 21);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (27 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (27 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (27 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (27 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 27);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 23)) << (27 - 23);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (27 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (27 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (27 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (27 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 27);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 25)) << (27 - 25);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (27 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (27 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (27 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (27 - 5);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 5;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack28_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 
{
-    *out = (in_buf.read_unaligned()) % (1u32 << 28);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (28 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (28 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (28 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (28 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (28 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (28 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 28);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (28 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (28 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (28 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (28 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (28 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (28 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 28);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (28 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (28 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (28 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (28 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (28 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (28 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 28);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (28 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (28 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (28 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (28 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (28 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (28 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack29_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 
{
-    *out = (in_buf.read_unaligned()) % (1u32 << 29);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 26)) << (29 - 26);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 23)) << (29 - 23);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (29 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (29 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (29 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (29 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (29 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (29 - 5);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 5;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (29 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 29);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 28)) << (29 - 28);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 25)) << (29 - 25);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (29 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (29 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (29 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (29 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (29 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (29 - 7);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 7;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (29 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (29 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 29);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 27)) << (29 - 27);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (29 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 21)) << (29 - 21);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (29 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (29 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (29 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (29 - 9);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 9;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (29 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (29 - 3);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 3;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack30_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 
{
-    *out = (in_buf.read_unaligned()) % (1u32 << 30);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 28)) << (30 - 28);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 26)) << (30 - 26);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (30 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (30 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (30 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (30 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (30 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (30 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (30 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (30 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (30 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (30 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (30 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (30 - 2);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 2;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 30);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 28)) << (30 - 28);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 26)) << (30 - 26);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (30 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (30 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (30 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (30 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (30 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (30 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (30 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (30 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (30 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (30 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (30 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (30 - 2);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 2;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack31_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 31);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 30)) << (31 - 30);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 29)) << (31 - 29);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 28)) << (31 - 28);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 27)) << (31 - 27);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 26)) << (31 - 26);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 25)) << (31 - 25);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (31 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 23)) << (31 - 23);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (31 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 21)) << (31 - 21);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (31 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (31 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (31 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (31 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (31 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (31 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (31 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (31 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (31 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (31 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (31 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (31 - 9);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 9;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (31 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (31 - 7);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 7;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (31 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (31 - 5);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 5;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (31 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (31 - 3);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 3;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (31 - 2);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 2;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (31 - 1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 1;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack32_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = in_buf.read_unaligned();
-
-    in_buf.offset(1)
-}
diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs
index 84c4d10ed..1dec9b03f 100644
--- a/parquet/src/util/bit_util.rs
+++ b/parquet/src/util/bit_util.rs
@@ -18,7 +18,8 @@
 use std::{cmp, mem::size_of};
 
 use crate::data_type::AsBytes;
-use crate::util::{bit_packing::unpack32, memory::ByteBufferPtr};
+use crate::util::bit_pack::{unpack16, unpack32, unpack64, unpack8};
+use crate::util::memory::ByteBufferPtr;
 
 #[inline]
 pub fn from_ne_slice<T: FromBytes>(bs: &[u8]) -> T {
@@ -457,12 +458,13 @@ impl BitReader {
         true
     }
 
-    /// Read multiple values from their packed representation
+    /// Read multiple values from their packed representation where each element is represented
+    /// by `num_bits` bits.
     ///
     /// # Panics
     ///
     /// This function panics if
-    /// - `bit_width` is larger than the bit-capacity of `T`
+    /// - `num_bits` is larger than the bit-capacity of `T`
     ///
     pub fn get_batch<T: FromBytes>(&mut self, batch: &mut [T], num_bits: usize) -> usize {
         assert!(num_bits <= size_of::<T>() * 8);
@@ -476,17 +478,6 @@ impl BitReader {
 
         let mut i = 0;
 
-        if num_bits > 32 {
-            // No fast path - read values individually
-            while i < values_to_read {
-                batch[i] = self
-                    .get_value(num_bits)
-                    .expect("expected to have more data");
-                i += 1;
-            }
-            return values_to_read;
-        }
-
         // First align bit offset to byte offset
         if self.bit_offset != 0 {
             while i < values_to_read && self.bit_offset != 0 {
@@ -497,46 +488,104 @@ impl BitReader {
             }
         }
 
-        let in_buf = &self.buffer.data()[self.byte_offset..];
-        let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32;
-        if size_of::<T>() == 4 {
-            while values_to_read - i >= 32 {
-                let out_ptr = &mut batch[i..] as *mut [T] as *mut T as *mut u32;
-                in_ptr = unsafe { unpack32(in_ptr, out_ptr, num_bits) };
-                self.byte_offset += 4 * num_bits;
-                i += 32;
+        let in_buf = self.buffer.data();
+
+        // Read directly into output buffer
+        match size_of::<T>() {
+            1 => {
+                let ptr = batch.as_mut_ptr() as *mut u8;
+                let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) };
+                while values_to_read - i >= 8 {
+                    let out_slice = (&mut out[i..i + 8]).try_into().unwrap();
+                    unpack8(&in_buf[self.byte_offset..], out_slice, num_bits);
+                    self.byte_offset += num_bits;
+                    i += 8;
+                }
             }
-        } else {
-            let mut out_buf = [0u32; 32];
-            let out_ptr = &mut out_buf as &mut [u32] as *mut [u32] as *mut u32;
-            while values_to_read - i >= 32 {
-                in_ptr = unsafe { unpack32(in_ptr, out_ptr, num_bits) };
-                self.byte_offset += 4 * num_bits;
-
-                for out in out_buf {
-                    // Zero-allocate buffer
-                    let mut out_bytes = T::Buffer::default();
-                    let in_bytes = out.to_le_bytes();
-
-                    {
-                        let out_bytes = out_bytes.as_mut();
-                        let len = out_bytes.len().min(in_bytes.len());
-                        (&mut out_bytes[..len]).copy_from_slice(&in_bytes[..len]);
-                    }
-
-                    batch[i] = T::from_le_bytes(out_bytes);
-                    i += 1;
+            2 => {
+                let ptr = batch.as_mut_ptr() as *mut u16;
+                let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) };
+                while values_to_read - i >= 16 {
+                    let out_slice = (&mut out[i..i + 16]).try_into().unwrap();
+                    unpack16(&in_buf[self.byte_offset..], out_slice, num_bits);
+                    self.byte_offset += 2 * num_bits;
+                    i += 16;
                 }
             }
+            4 => {
+                let ptr = batch.as_mut_ptr() as *mut u32;
+                let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) };
+                while values_to_read - i >= 32 {
+                    let out_slice = (&mut out[i..i + 32]).try_into().unwrap();
+                    unpack32(&in_buf[self.byte_offset..], out_slice, num_bits);
+                    self.byte_offset += 4 * num_bits;
+                    i += 32;
+                }
+            }
+            8 => {
+                let ptr = batch.as_mut_ptr() as *mut u64;
+                let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) };
+                while values_to_read - i >= 64 {
+                    let out_slice = (&mut out[i..i + 64]).try_into().unwrap();
+                    unpack64(&in_buf[self.byte_offset..], out_slice, num_bits);
+                    self.byte_offset += 8 * num_bits;
+                    i += 64;
+                }
+            }
+            _ => unreachable!(),
+        }
+
+        // Try to read smaller batches if possible
+        if size_of::<T>() > 4 && values_to_read - i >= 32 && num_bits <= 32 {
+            let mut out_buf = [0_u32; 32];
+            unpack32(&in_buf[self.byte_offset..], &mut out_buf, num_bits);
+            self.byte_offset += 4 * num_bits;
+
+            for out in out_buf {
+                // Zero-allocate buffer
+                let mut out_bytes = T::Buffer::default();
+                out_bytes.as_mut()[..4].copy_from_slice(&out.to_le_bytes());
+                batch[i] = T::from_le_bytes(out_bytes);
+                i += 1;
+            }
+        }
+
+        if size_of::<T>() > 2 && values_to_read - i >= 16 && num_bits <= 16 {
+            let mut out_buf = [0_u16; 16];
+            unpack16(&in_buf[self.byte_offset..], &mut out_buf, num_bits);
+            self.byte_offset += 2 * num_bits;
+
+            for out in out_buf {
+                // Zero-allocate buffer
+                let mut out_bytes = T::Buffer::default();
+                out_bytes.as_mut()[..2].copy_from_slice(&out.to_le_bytes());
+                batch[i] = T::from_le_bytes(out_bytes);
+                i += 1;
+            }
         }
 
-        assert!(values_to_read - i < 32);
+        if size_of::<T>() > 1 && values_to_read - i >= 8 && num_bits <= 8 {
+            let mut out_buf = [0_u8; 8];
+            unpack8(&in_buf[self.byte_offset..], &mut out_buf, num_bits);
+            self.byte_offset += num_bits;
+
+            for out in out_buf {
+                // Zero-allocate buffer
+                let mut out_bytes = T::Buffer::default();
+                out_bytes.as_mut()[..1].copy_from_slice(&out.to_le_bytes());
+                batch[i] = T::from_le_bytes(out_bytes);
+                i += 1;
+            }
+        }
 
         self.reload_buffer_values();
+
+        // Read any trailing values
         while i < values_to_read {
-            batch[i] = self
+            let value = self
                 .get_value(num_bits)
                 .expect("expected to have more data");
+            batch[i] = value;
             i += 1;
         }
 
@@ -1014,11 +1063,12 @@ mod tests {
     fn test_get_batch() {
         const SIZE: &[usize] = &[1, 31, 32, 33, 128, 129];
         for s in SIZE {
-            for i in 0..33 {
+            for i in 0..=64 {
                 match i {
                     0..=8 => test_get_batch_helper::<u8>(*s, i),
                     9..=16 => test_get_batch_helper::<u16>(*s, i),
-                    _ => test_get_batch_helper::<u32>(*s, i),
+                    17..=32 => test_get_batch_helper::<u32>(*s, i),
+                    _ => test_get_batch_helper::<u64>(*s, i),
                 }
             }
         }
@@ -1028,13 +1078,18 @@ mod tests {
     where
         T: FromBytes + Default + Clone + Debug + Eq,
     {
-        assert!(num_bits <= 32);
+        assert!(num_bits <= 64);
         let num_bytes = ceil(num_bits, 8);
         let mut writer = BitWriter::new(num_bytes as usize * total);
 
-        let values: Vec<u32> = random_numbers::<u32>(total)
+        let mask = match num_bits {
+            64 => u64::MAX,
+            _ => (1 << num_bits) - 1,
+        };
+
+        let values: Vec<u64> = random_numbers::<u64>(total)
             .iter()
-            .map(|v| v & ((1u64 << num_bits) - 1) as u32)
+            .map(|v| v & mask)
             .collect();
 
         // Generic values used to check against actual values read from `get_batch`.
@@ -1050,9 +1105,12 @@ mod tests {
         assert_eq!(values_read, values.len());
         for i in 0..batch.len() {
             assert_eq!(
-                batch[i], expected_values[i],
-                "num_bits = {}, index = {}",
-                num_bits, i
+                batch[i],
+                expected_values[i],
+                "max_num_bits = {}, num_bits = {}, index = {}",
+                size_of::<T>() * 8,
+                num_bits,
+                i
             );
         }
     }
diff --git a/parquet/src/util/mod.rs b/parquet/src/util/mod.rs
index 01ac39116..8510b1c2c 100644
--- a/parquet/src/util/mod.rs
+++ b/parquet/src/util/mod.rs
@@ -19,7 +19,7 @@ pub mod io;
 pub mod memory;
 #[macro_use]
 pub mod bit_util;
-mod bit_packing;
+mod bit_pack;
 pub mod cursor;
 pub(crate) mod interner;
 pub(crate) mod page_util;

Reply via email to