wgtmac commented on code in PR #1375:
URL: https://github.com/apache/orc/pull/1375#discussion_r1139740147


##########
.github/workflows/build_and_test.yml:
##########
@@ -91,6 +91,49 @@ jobs:
         cmake --build . --config Debug
         ctest -C Debug --output-on-failure
 
+  simdUbuntu:
+    name: "SIMD programming using C++ intrinsic functions on ${{ matrix.os }}"
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - ubuntu-22.04
+        cxx:
+          - clang++
+    env:
+      ORC_USER_SIMD_LEVEL: avx512

Review Comment:
   Is the value of `ORC_USER_SIMD_LEVEL` case-insensitive? If so, I'd prefer uppercase (`AVX512`) here.



##########
.github/workflows/build_and_test.yml:
##########
@@ -91,6 +91,49 @@ jobs:
         cmake --build . --config Debug
         ctest -C Debug --output-on-failure
 
+  simdUbuntu:
+    name: "SIMD programming using C++ intrinsic functions on ${{ matrix.os }}"
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - ubuntu-22.04
+        cxx:
+          - clang++
+    env:
+      ORC_USER_SIMD_LEVEL: avx512
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+    - name: "Test"
+      run: |
+        mkdir -p ~/.m2
+        mkdir build
+        cd build
+        cmake -DBUILD_JAVA=OFF -DBUILD_ENABLE_AVX512=ON ..
+        make package test-out
+
+  simdWindows:
+    name: "SIMD programming using C++ intrinsic functions on Windows"
+    runs-on: windows-2019
+    env:
+      ORC_USER_SIMD_LEVEL: avx512

Review Comment:
   ditto



##########
c++/src/BpackingAvx512.hh:
##########
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BPACKINGAVX512_HH
+#define ORC_BPACKINGAVX512_HH
+
+#include <stdlib.h>
+#include <cstdint>
+
+#include "BpackingDefault.hh"
+#include "Dispatch.hh"
+#include "RLEv2.hh"

Review Comment:
   We'd better use forward declarations and remove the unnecessary includes.



##########
c++/src/BitUnpackerAvx512.hh:
##########
@@ -0,0 +1,488 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BIT_UNPACKER_AVX512_HH
+#define ORC_BIT_UNPACKER_AVX512_HH
+
+// Mingw-w64 defines strcasecmp in string.h
+#if defined(_WIN32) && !defined(strcasecmp)
+#include <string.h>
+#define strcasecmp stricmp
+#else
+#include <strings.h>
+#endif
+
+#include <immintrin.h>
+#include <cstdint>
+#include <vector>
+
+namespace orc {
+#define ORC_VECTOR_BITS_2_BYTE(x) \
+  (((x) + 7u) >> 3u) /**< Convert a number of bits to a number of bytes */
+#define ORC_VECTOR_ONE_64U (1ULL)
+#define ORC_VECTOR_MAX_16U 0xFFFF     /**< Max value for uint16_t */
+#define ORC_VECTOR_MAX_32U 0xFFFFFFFF /**< Max value for uint32_t */
+#define ORC_VECTOR_BYTE_WIDTH 8u      /**< Byte width in bits */
+#define ORC_VECTOR_WORD_WIDTH 16u     /**< Word width in bits */
+#define ORC_VECTOR_DWORD_WIDTH 32u    /**< Dword width in bits */
+#define ORC_VECTOR_QWORD_WIDTH 64u    /**< Qword width in bits */
+#define ORC_VECTOR_BIT_MASK(x) \
+  ((ORC_VECTOR_ONE_64U << (x)) - 1u) /**< Bit mask below bit position */
+
+#define ORC_VECTOR_BITS_2_WORD(x) \
+  (((x) + 15u) >> 4u) /**< Convert a number of bits to a number of words */
+#define ORC_VECTOR_BITS_2_DWORD(x) \
+  (((x) + 31u) >> 5u) /**< Convert a number of bits to a number of double 
words */
+
+  // ------------------------------------ 3u 
-----------------------------------------
+  static uint8_t shuffleIdxTable3u_0[64] = {
+      1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 
1u, 0u, 2u, 1u,
+      3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 
4u, 3u, 4u, 3u,
+      5u, 4u, 6u, 5u, 1u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 4u, 3u, 5u, 4u, 
6u, 5u};
+  static uint8_t shuffleIdxTable3u_1[64] = {
+      0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 
1u, 0u, 2u, 1u,
+      3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 
3u, 2u, 4u, 3u,
+      5u, 4u, 6u, 5u, 0u, 0u, 1u, 0u, 2u, 1u, 3u, 2u, 3u, 2u, 4u, 3u, 5u, 4u, 
6u, 5u};
+  static uint16_t shiftTable3u_0[32] = {13u, 7u,  9u,  11u, 13u, 7u,  9u,  
11u, 13u, 7u,  9u,
+                                        11u, 13u, 7u,  9u,  11u, 13u, 7u,  9u, 
 11u, 13u, 7u,
+                                        9u,  11u, 13u, 7u,  9u,  11u, 13u, 7u, 
 9u,  11u};
+  static uint16_t shiftTable3u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 
4u, 2u,
+                                        0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 
6u, 4u,
+                                        2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 
0u};
+  static uint16_t permutexIdxTable3u[32] = {0u,  1u,  2u,  0x0, 0x0, 0x0, 0x0, 
0x0, 3u,  4u,  5u,
+                                            0x0, 0x0, 0x0, 0x0, 0x0, 6u,  7u,  
8u,  0x0, 0x0, 0x0,
+                                            0x0, 0x0, 9u,  10u, 11u, 0x0, 0x0, 
0x0, 0x0, 0x0};
+
+  // ------------------------------------ 5u 
-----------------------------------------
+  static uint8_t shuffleIdxTable5u_0[64] = {
+      1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 
2u, 1u, 3u, 2u,
+      4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 
6u, 5u, 7u, 6u,
+      8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 
9u, 8u};
+  static uint8_t shuffleIdxTable5u_1[64] = {
+      1u, 0u, 2u,  1u, 3u, 2u, 5u, 4u, 6u,  5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 
0u, 2u,  1u, 3u, 2u,
+      5u, 4u, 6u,  5u, 7u, 6u, 8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u,  2u, 5u, 
4u, 6u,  5u, 7u, 6u,
+      8u, 7u, 10u, 9u, 1u, 0u, 2u, 1u, 3u,  2u, 5u, 4u, 6u, 5u, 7u,  6u, 8u, 
7u, 10u, 9u};
+  static uint16_t shiftTable5u_0[32] = {11u, 9u,  7u,  5u, 11u, 9u,  7u,  5u, 
11u, 9u,  7u,
+                                        5u,  11u, 9u,  7u, 5u,  11u, 9u,  7u, 
5u,  11u, 9u,
+                                        7u,  5u,  11u, 9u, 7u,  5u,  11u, 9u, 
7u,  5u};
+  static uint16_t shiftTable5u_1[32] = {2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 2u, 
4u, 6u,
+                                        0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 0u, 
2u, 4u,
+                                        6u, 0u, 2u, 4u, 6u, 0u, 2u, 4u, 6u, 
0u};
+  static uint16_t permutexIdxTable5u[32] = {0u,  1u,  2u,  3u,  4u,  0x0, 0x0, 
0x0, 5u,  6u,  7u,
+                                            8u,  9u,  0x0, 0x0, 0x0, 10u, 11u, 
12u, 13u, 14u, 0x0,
+                                            0x0, 0x0, 15u, 16u, 17u, 18u, 19u, 
0x0, 0x0, 0x0};
+
+  // ------------------------------------ 6u 
-----------------------------------------
+  static uint8_t shuffleIdxTable6u_0[64] = {
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u};
+  static uint8_t shuffleIdxTable6u_1[64] = {
+      1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u,
+      1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u,
+      1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u,
+      1u, 0u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 9u, 8u, 10u, 9u, 12u, 11u};
+  static uint16_t shiftTable6u_0[32] = {10u, 6u,  10u, 6u,  10u, 6u,  10u, 6u, 
 10u, 6u,  10u,
+                                        6u,  10u, 6u,  10u, 6u,  10u, 6u,  
10u, 6u,  10u, 6u,
+                                        10u, 6u,  10u, 6u,  10u, 6u,  10u, 6u, 
 10u, 6u};
+  static uint16_t shiftTable6u_1[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 
0u, 4u,
+                                        0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 
4u, 0u,
+                                        4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 
0u};
+  static uint32_t permutexIdxTable6u[16] = {0u, 1u, 2u, 0x0, 3u, 4u,  5u,  0x0,
+                                            6u, 7u, 8u, 0x0, 9u, 10u, 11u, 
0x0};
+
+  // ------------------------------------ 7u 
-----------------------------------------
+  static uint8_t shuffleIdxTable7u_0[64] = {
+      1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u,
+      1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u,
+      1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u,
+      1u, 0u, 2u, 1u, 4u, 3u, 6u, 5u, 8u, 7u, 9u, 8u, 11u, 10u, 13u, 12u};
+  static uint8_t shuffleIdxTable7u_1[64] = {
+      1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u,
+      1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u,
+      1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u,
+      1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 12u, 11u, 14u, 13u};
+  static uint16_t shiftTable7u_0[32] = {9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 9u, 
3u, 5u,
+                                        7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 7u, 
9u, 3u,
+                                        5u, 7u, 9u, 3u, 5u, 7u, 9u, 3u, 5u, 
7u};
+  static uint16_t shiftTable7u_1[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 
4u, 2u,
+                                        0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 
6u, 4u,
+                                        2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 
0u};
+  static uint16_t permutexIdxTable7u[32] = {0u,  1u,  2u,  3u,  4u,  5u,  6u,  
0x0, 7u,  8u,  9u,
+                                            10u, 11u, 12u, 13u, 0x0, 14u, 15u, 
16u, 17u, 18u, 19u,
+                                            20u, 0x0, 21u, 22u, 23u, 24u, 25u, 
26u, 27u, 0x0};
+
+  // ------------------------------------ 9u 
-----------------------------------------
+  static uint16_t permutexIdxTable9u_0[32] = {0u,  1u,  1u,  2u,  2u,  3u,  
3u,  4u,  4u,  5u,  5u,
+                                              6u,  6u,  7u,  7u,  8u,  9u,  
10u, 10u, 11u, 11u, 12u,
+                                              12u, 13u, 13u, 14u, 14u, 15u, 
15u, 16u, 16u, 17u};
+  static uint16_t permutexIdxTable9u_1[32] = {0u,  1u,  1u,  2u,  2u,  3u,  
3u,  4u,  5u,  6u,  6u,
+                                              7u,  7u,  8u,  8u,  9u,  9u,  
10u, 10u, 11u, 11u, 12u,
+                                              12u, 13u, 14u, 15u, 15u, 16u, 
16u, 17u, 17u, 18u};
+  static uint32_t shiftTable9u_0[16] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+                                        0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u};
+  static uint32_t shiftTable9u_1[16] = {7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u,
+                                        7u, 5u, 3u, 1u, 15u, 13u, 11u, 9u};
+
+  static uint8_t shuffleIdxTable9u_0[64] = {
+      1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 
2u, 1u, 3u, 2u,
+      4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 
5u, 4u, 6u, 5u,
+      7u, 6u, 8u, 7u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 5u, 4u, 6u, 5u, 7u, 6u, 
8u, 7u};
+  static uint16_t shiftTable9u_2[32] = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, 7u, 
6u, 5u,
+                                        4u, 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 
3u, 2u,
+                                        1u, 0u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 
0u};
+  static uint64_t gatherIdxTable9u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u};
+
+  // ------------------------------------ 10u 
-----------------------------------------
+  static uint8_t shuffleIdxTable10u_0[64] = {
+      1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 
2u, 1u, 3u, 2u,
+      4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 
6u, 5u, 7u, 6u,
+      8u, 7u, 9u, 8u, 1u, 0u, 2u, 1u, 3u, 2u, 4u, 3u, 6u, 5u, 7u, 6u, 8u, 7u, 
9u, 8u};
+  static uint16_t shiftTable10u[32] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 
2u,
+                                       0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u, 6u, 
4u,
+                                       2u, 0u, 6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u};
+  static uint16_t permutexIdxTable10u[32] = {0u,  1u,  2u,  3u,  4u,  0x0, 
0x0, 0x0, 5u,  6u,  7u,
+                                             8u,  9u,  0x0, 0x0, 0x0, 10u, 
11u, 12u, 13u, 14u, 0x0,
+                                             0x0, 0x0, 15u, 16u, 17u, 18u, 
19u, 0x0, 0x0, 0x0};
+
+  // ------------------------------------ 11u 
-----------------------------------------
+  static uint16_t permutexIdxTable11u_0[32] = {
+      0u,  1u,  1u,  2u,  2u,  3u,  4u,  5u,  5u,  6u,  6u,  7u,  8u,  9u,  
9u,  10u,
+      11u, 12u, 12u, 13u, 13u, 14u, 15u, 16u, 16u, 17u, 17u, 18u, 19u, 20u, 
20u, 21u};
+  static uint16_t permutexIdxTable11u_1[32] = {
+      0u,  1u,  2u,  3u,  3u,  4u,  4u,  5u,  6u,  7u,  7u,  8u,  8u,  9u,  
10u, 11u,
+      11u, 12u, 13u, 14u, 14u, 15u, 15u, 16u, 17u, 18u, 18u, 19u, 19u, 20u, 
21u, 22u};
+  static uint32_t shiftTable11u_0[16] = {0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u,
+                                         0u, 6u, 12u, 2u, 8u, 14u, 4u, 10u};
+  static uint32_t shiftTable11u_1[16] = {5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u,
+                                         5u, 15u, 9u, 3u, 13u, 7u, 1u, 11u};
+
+  static uint8_t shuffleIdxTable11u_0[64] = {
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u};
+  static uint8_t shuffleIdxTable11u_1[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 8u, 7u, 6u, 0u, 11u, 10u, 9u, 0u};
+  static uint32_t shiftTable11u_2[16] = {21u, 15u, 17u, 19u, 21u, 15u, 17u, 
19u,
+                                         21u, 15u, 17u, 19u, 21u, 15u, 17u, 
19u};
+  static uint32_t shiftTable11u_3[16] = {6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u,
+                                         6u, 4u, 10u, 8u, 6u, 4u, 10u, 8u};
+  static uint64_t gatherIdxTable11u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 
41u};
+
+  // ------------------------------------ 12u 
-----------------------------------------
+  static uint8_t shuffleIdxTable12u_0[64] = {
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u,
+      1u, 0u, 2u, 1u, 4u, 3u, 5u, 4u, 7u, 6u, 8u, 7u, 10u, 9u, 11u, 10u};
+  static uint16_t shiftTable12u[32] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 
4u,
+                                       0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 
0u,
+                                       4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u};
+  static uint32_t permutexIdxTable12u[16] = {0u, 1u, 2u, 0x0, 3u, 4u,  5u,  
0x0,
+                                             6u, 7u, 8u, 0x0, 9u, 10u, 11u, 
0x0};
+
+  // ------------------------------------ 13u 
-----------------------------------------
+  static uint16_t permutexIdxTable13u_0[32] = {
+      0u,  1u,  1u,  2u,  3u,  4u,  4u,  5u,  6u,  7u,  8u,  9u,  9u,  10u, 
11u, 12u,
+      13u, 14u, 14u, 15u, 16u, 17u, 17u, 18u, 19u, 20u, 21u, 22u, 22u, 23u, 
24u, 25u};
+  static uint16_t permutexIdxTable13u_1[32] = {
+      0u,  1u,  2u,  3u,  4u,  5u,  5u,  6u,  7u,  8u,  8u,  9u,  10u, 11u, 
12u, 13u,
+      13u, 14u, 15u, 16u, 17u, 18u, 18u, 19u, 20u, 21u, 21u, 22u, 23u, 24u, 
25u, 26u};
+  static uint32_t shiftTable13u_0[16] = {0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u,
+                                         0u, 10u, 4u, 14u, 8u, 2u, 12u, 6u};
+  static uint32_t shiftTable13u_1[16] = {3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u,
+                                         3u, 9u, 15u, 5u, 11u, 1u, 7u, 13u};
+
+  static uint8_t shuffleIdxTable13u_0[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u};
+  static uint8_t shuffleIdxTable13u_1[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 0u, 10u, 9u, 8u, 0u, 13u, 12u, 11u, 0u};
+  static uint32_t shiftTable13u_2[16] = {19u, 17u, 15u, 13u, 19u, 17u, 15u, 
13u,
+                                         19u, 17u, 15u, 13u, 19u, 17u, 15u, 
13u};
+  static uint32_t shiftTable13u_3[16] = {10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u,
+                                         10u, 12u, 6u, 8u, 10u, 12u, 6u, 8u};
+  static uint64_t gatherIdxTable13u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 
47u};
+
+  // ------------------------------------ 14u 
-----------------------------------------
+  static uint8_t shuffleIdxTable14u_0[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u};
+  static uint8_t shuffleIdxTable14u_1[64] = {
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 10u, 9u, 8u, 0u, 14u, 13u, 12u, 0u};
+  static uint32_t shiftTable14u_0[16] = {18u, 14u, 18u, 14u, 18u, 14u, 18u, 
14u,
+                                         18u, 14u, 18u, 14u, 18u, 14u, 18u, 
14u};
+  static uint32_t shiftTable14u_1[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u,
+                                         12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u};
+  static uint16_t permutexIdxTable14u[32] = {0u,  1u,  2u,  3u,  4u,  5u,  6u, 
 0x0, 7u,  8u,  9u,
+                                             10u, 11u, 12u, 13u, 0x0, 14u, 
15u, 16u, 17u, 18u, 19u,
+                                             20u, 0x0, 21u, 22u, 23u, 24u, 
25u, 26u, 27u, 0x0};
+
+  // ------------------------------------ 15u 
-----------------------------------------
+  static uint16_t permutexIdxTable15u_0[32] = {
+      0u,  1u,  1u,  2u,  3u,  4u,  5u,  6u,  7u,  8u,  9u,  10u, 11u, 12u, 
13u, 14u,
+      15u, 16u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 
28u, 29u};
+  static uint16_t permutexIdxTable15u_1[32] = {
+      0u,  1u,  2u,  3u,  4u,  5u,  6u,  7u,  8u,  9u,  10u, 11u, 12u, 13u, 
14u, 15u,
+      15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u, 
29u, 30u};
+  static uint32_t shiftTable15u_0[16] = {0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u,
+                                         0u, 14u, 12u, 10u, 8u, 6u, 4u, 2u};
+  static uint32_t shiftTable15u_1[16] = {1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u,
+                                         1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+
+  static uint8_t shuffleIdxTable15u_0[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 14u, 13u, 12u, 11u};
+  static uint8_t shuffleIdxTable15u_1[64] = {
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u,
+      3u, 2u, 1u, 0u, 7u, 6u, 5u, 0u, 11u, 10u, 9u, 0u, 15u, 14u, 13u, 0u};
+  static uint32_t shiftTable15u_2[16] = {17u, 11u, 13u, 15u, 17u, 11u, 13u, 
15u,
+                                         17u, 11u, 13u, 15u, 17u, 11u, 13u, 
15u};
+  static uint32_t shiftTable15u_3[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u,
+                                         14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u};
+  static uint64_t gatherIdxTable15u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 
53u};
+
+  // ------------------------------------ 17u 
-----------------------------------------
+  static uint32_t permutexIdxTable17u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u};
+  static uint32_t permutexIdxTable17u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u};
+  static uint64_t shiftTable17u_0[8] = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u};
+  static uint64_t shiftTable17u_1[8] = {15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+
+  static uint8_t shuffleIdxTable17u_0[64] = {
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 
1u, 0u, 5u, 4u,
+      3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 
7u, 6u, 5u, 4u,
+      9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 
7u, 6u};
+  static uint32_t shiftTable17u_2[16] = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                         15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u};
+  static uint64_t gatherIdxTable17u[8] = {0u, 8u, 8u, 16u, 17u, 25u, 25u, 33u};
+
+  // ------------------------------------ 18u 
-----------------------------------------
+  static uint32_t permutexIdxTable18u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               4u, 5u, 5u, 6u, 6u, 7u, 7u, 8u};
+  static uint32_t permutexIdxTable18u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u};
+  static uint64_t shiftTable18u_0[8] = {0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u};
+  static uint64_t shiftTable18u_1[8] = {14u, 10u, 6u, 2u, 30u, 26u, 22u, 18u};
+
+  static uint8_t shuffleIdxTable18u_0[64] = {
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 
1u, 0u, 5u, 4u,
+      3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 
7u, 6u, 5u, 4u,
+      9u, 8u, 7u, 6u, 3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 7u, 6u, 5u, 4u, 9u, 8u, 
7u, 6u};
+  static uint32_t shiftTable18u_2[16] = {14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u,
+                                         14u, 12u, 10u, 8u, 14u, 12u, 10u, 8u};
+  static uint64_t gatherIdxTable18u[8] = {0u, 8u, 9u, 17u, 18u, 26u, 27u, 35u};
+
+  // ------------------------------------ 19u 
-----------------------------------------
+  static uint32_t permutexIdxTable19u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               4u, 5u, 5u, 6u, 7u, 8u, 8u, 9u};
+  static uint32_t permutexIdxTable19u_1[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u,
+                                               5u, 6u, 6u, 7u, 7u, 8u, 8u, 9u};
+  static uint64_t shiftTable19u_0[8] = {0u, 6u, 12u, 18u, 24u, 30u, 4u, 10u};
+  static uint64_t shiftTable19u_1[8] = {13u, 7u, 1u, 27u, 21u, 15u, 9u, 3u};
+
+  static uint8_t shuffleIdxTable19u_0[64] = {
+      3u,  2u, 1u, 0u, 5u, 4u, 3u,  2u, 7u, 6u, 5u, 4u, 10u, 9u, 8u, 7u, 3u,  
2u, 1u, 0u, 5u, 4u,
+      3u,  2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u,  0u, 5u, 4u, 3u,  
2u, 7u, 6u, 5u, 4u,
+      10u, 9u, 8u, 7u, 3u, 2u, 1u,  0u, 5u, 4u, 3u, 2u, 8u,  7u, 6u, 5u, 10u, 
9u, 8u, 7u};
+  static uint32_t shiftTable19u_2[16] = {13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u,
+                                         13u, 10u, 7u, 12u, 9u, 6u, 11u, 8u};
+  static uint64_t gatherIdxTable19u[8] = {0u, 8u, 9u, 17u, 19u, 27u, 28u, 36u};
+
+  // ------------------------------------ 20u 
-----------------------------------------
+  static uint8_t shuffleIdxTable20u_0[64] = {
+      3u,  2u, 1u, 0u, 5u, 4u, 3u,  2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u,  
2u, 1u, 0u, 5u, 4u,
+      3u,  2u, 8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u, 2u, 1u,  0u, 5u, 4u, 3u,  
2u, 8u, 7u, 6u, 5u,
+      10u, 9u, 8u, 7u, 3u, 2u, 1u,  0u, 5u, 4u, 3u, 2u, 8u,  7u, 6u, 5u, 10u, 
9u, 8u, 7u};
+  static uint32_t shiftTable20u[16] = {12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u,
+                                       12u, 8u, 12u, 8u, 12u, 8u, 12u, 8u};
+  static uint16_t permutexIdxTable20u[32] = {0u,  1u,  2u,  3u,  4u,  0x0, 
0x0, 0x0, 5u,  6u,  7u,
+                                             8u,  9u,  0x0, 0x0, 0x0, 10u, 
11u, 12u, 13u, 14u, 0x0,
+                                             0x0, 0x0, 15u, 16u, 17u, 18u, 
19u, 0x0, 0x0, 0x0};
+
+  // ------------------------------------ 21u 
-----------------------------------------
+  static uint32_t permutexIdxTable21u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 3u, 4u,
+                                               5u, 6u, 6u, 7u, 7u, 8u, 9u, 
10u};
+  static uint32_t permutexIdxTable21u_1[16] = {0u, 1u, 1u, 2u, 3u, 4u, 4u, 5u,
+                                               5u, 6u, 7u, 8u, 8u, 9u, 9u, 
10u};
+  static uint64_t shiftTable21u_0[8] = {0u, 10u, 20u, 30u, 8u, 18u, 28u, 6u};
+  static uint64_t shiftTable21u_1[8] = {11u, 1u, 23u, 13u, 3u, 25u, 15u, 5u};
+
+  static uint8_t shuffleIdxTable21u_0[64] = {
+      3u,  2u, 1u, 0u, 5u, 4u, 3u,  2u,  8u, 7u, 6u, 5u, 10u, 9u, 8u, 7u, 3u,  
2u,  1u, 0u, 6u, 5u,
+      4u,  3u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u, 3u, 2u, 1u,  0u, 5u, 4u, 3u,  
2u,  8u, 7u, 6u, 5u,
+      10u, 9u, 8u, 7u, 3u, 2u, 1u,  0u,  6u, 5u, 4u, 3u, 8u,  7u, 6u, 5u, 11u, 
10u, 9u, 8u};
+  static uint32_t shiftTable21u_2[16] = {11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u,
+                                         11u, 6u, 9u, 4u, 7u, 10u, 5u, 8u};
+  static uint64_t gatherIdxTable21u[8] = {0u, 8u, 10u, 18u, 21u, 29u, 31u, 
39u};
+
+  // ------------------------------------ 22u 
-----------------------------------------
+  static uint32_t permutexIdxTable22u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u, 5u,
+                                               5u, 6u, 6u, 7u, 8u, 9u, 9u, 
10u};
+  static uint32_t permutexIdxTable22u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u, 4u,  5u,
+                                               6u, 7u, 7u, 8u, 8u, 9u, 10u, 
11u};
+  static uint64_t shiftTable22u_0[8] = {0u, 12u, 24u, 4u, 16u, 28u, 8u, 20u};
+  static uint64_t shiftTable22u_1[8] = {10u, 30u, 18u, 6u, 26u, 14u, 2u, 22u};
+
+  static uint8_t shuffleIdxTable22u_0[64] = {
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u, 8u};
+  static uint32_t shiftTable22u_2[16] = {10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u,
+                                         10u, 4u, 6u, 8u, 10u, 4u, 6u, 8u};
+  static uint64_t gatherIdxTable22u[8] = {0u, 8u, 11u, 19u, 22u, 30u, 33u, 
41u};
+
+  // ------------------------------------ 23u 
-----------------------------------------
+  static uint32_t permutexIdxTable23u_0[16] = {0u, 1u, 1u, 2u, 2u, 3u, 4u,  5u,
+                                               5u, 6u, 7u, 8u, 8u, 9u, 10u, 
11u};
+  static uint32_t permutexIdxTable23u_1[16] = {0u, 1u, 2u, 3u, 3u, 4u,  5u,  
6u,
+                                               6u, 7u, 7u, 8u, 9u, 10u, 10u, 
11u};
+  static uint64_t shiftTable23u_0[8] = {0u, 14u, 28u, 10u, 24u, 6u, 20u, 2u};
+  static uint64_t shiftTable23u_1[8] = {9u, 27u, 13u, 31u, 17u, 3u, 21u, 7u};
+
+  static uint8_t shuffleIdxTable23u_0[64] = {
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u,  8u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 5u, 4u, 3u, 2u, 8u, 7u, 6u, 5u, 11u, 10u, 9u,  8u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u};
+  static uint32_t shiftTable23u_2[16] = {9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u,
+                                         9u, 2u, 3u, 4u, 5u, 6u, 7u, 8u};
+  static uint64_t gatherIdxTable23u[8] = {0u, 8u, 11u, 19u, 23u, 31u, 34u, 
42u};
+
+  // ------------------------------------ 24u 
-----------------------------------------
+  static uint8_t shuffleIdxTable24u_0[64] = {
+      2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF,
+      2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF,
+      2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 0xFF,
+      2u, 1u, 0u, 0xFF, 5u, 4u, 3u, 0xFF, 8u, 7u, 6u, 0xFF, 11u, 10u, 9u, 
0xFF};
+  static uint32_t permutexIdxTable24u[16] = {0u, 1u, 2u, 0x0, 3u, 4u,  5u,  
0x0,
+                                             6u, 7u, 8u, 0x0, 9u, 10u, 11u, 
0x0};
+
+  // ------------------------------------ 26u 
-----------------------------------------
+  static uint32_t permutexIdxTable26u_0[16] = {0u, 1u, 1u, 2u, 3u, 4u,  4u,  
5u,
+                                               6u, 7u, 8u, 9u, 9u, 10u, 11u, 
12u};
+  static uint32_t permutexIdxTable26u_1[16] = {0u, 1u, 2u, 3u, 4u,  5u,  5u,  
6u,
+                                               7u, 8u, 8u, 9u, 10u, 11u, 12u, 
13u};
+  static uint64_t shiftTable26u_0[8] = {0u, 20u, 8u, 28u, 16u, 4u, 24u, 12u};
+  static uint64_t shiftTable26u_1[8] = {6u, 18u, 30u, 10u, 22u, 2u, 14u, 26u};
+
+  static uint8_t shuffleIdxTable26u_0[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 9u, 8u, 7u, 6u, 12u, 11u, 10u, 9u};
+  static uint32_t shiftTable26u_2[16] = {6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u,
+                                         6u, 4u, 2u, 0u, 6u, 4u, 2u, 0u};
+  static uint64_t gatherIdxTable26u[8] = {0u, 8u, 13u, 21u, 26u, 34u, 39u, 
47u};
+
+  // ------------------------------------ 28u 
-----------------------------------------
+  static uint8_t shuffleIdxTable28u_0[64] = {
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u,
+      3u, 2u, 1u, 0u, 6u, 5u, 4u, 3u, 10u, 9u, 8u, 7u, 13u, 12u, 11u, 10u};
+  static uint32_t shiftTable28u[16] = {4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u,
+                                       4u, 0u, 4u, 0u, 4u, 0u, 4u, 0u};
+  static uint16_t permutexIdxTable28u[32] = {0u,  1u,  2u,  3u,  4u,  5u,  6u, 
 0x0, 7u,  8u,  9u,
+                                             10u, 11u, 12u, 13u, 0x0, 14u, 
15u, 16u, 17u, 18u, 19u,
+                                             20u, 0x0, 21u, 22u, 23u, 24u, 
25u, 26u, 27u, 0x0};
+
+  // ------------------------------------ 30u 
-----------------------------------------
+  static uint32_t permutexIdxTable30u_0[16] = {0u, 1u, 1u, 2u,  3u,  4u,  5u,  
6u,
+                                               7u, 8u, 9u, 10u, 11u, 12u, 13u, 
14u};
+  static uint32_t permutexIdxTable30u_1[16] = {0u, 1u, 2u,  3u,  4u,  5u,  6u, 
 7u,
+                                               8u, 9u, 10u, 11u, 12u, 13u, 
14u, 15u};
+  static uint64_t shiftTable30u_0[8] = {0u, 28u, 24u, 20u, 16u, 12u, 8u, 4u};
+  static uint64_t shiftTable30u_1[8] = {2u, 6u, 10u, 14u, 18u, 22u, 26u, 30u};
+
+  static uint8_t shuffleIdxTable30u_0[64] = {
+      0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u,
+      0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u,
+      0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u,
+      0u, 0u, 0u, 4u, 3u, 2u, 1u, 0u, 0u, 0u, 0u, 11u, 10u, 9u, 8u, 7u};
+  static uint8_t shuffleIdxTable30u_1[64] = {
+      7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u,
+      7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u,
+      7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u,
+      7u, 6u, 5u, 4u, 3u, 0u, 0u, 0u, 15u, 14u, 13u, 12u, 11u, 0u, 0u, 0u};
+  static uint64_t shiftTable30u_2[8] = {34u, 30u, 34u, 30u, 34u, 30u, 34u, 
30u};
+  static uint64_t shiftTable30u_3[8] = {28u, 24u, 28u, 24u, 28u, 24u, 28u, 
24u};
+  static uint64_t gatherIdxTable30u[8] = {0u, 8u, 15u, 23u, 30u, 38u, 45u, 
53u};
+
+  static uint64_t nibbleReverseTable[8] = {
+      0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 
0x0F070B030D050901,
+      0x0E060A020C040800, 0x0F070B030D050901, 0x0E060A020C040800, 
0x0F070B030D050901};
+
+  static uint64_t reverseMaskTable1u[8] = {
+      0x0001020304050607, 0x08090A0B0C0D0E0F, 0x1011121314151617, 
0x18191A1B1C1D1E1F,
+      0x2021222324252627, 0x28292A2B2C2D2E2F, 0x3031323334353637, 
0x38393A3B3C3D3E3F};
+
+  static uint64_t reverseMaskTable16u[8] = {
+      0x0607040502030001, 0x0E0F0C0D0A0B0809, 0x1617141512131011, 
0x1E1F1C1D1A1B1819,
+      0x2627242522232021, 0x2E2F2C2D2A2B2829, 0x3637343532333031, 
0x3E3F3C3D3A3B3839};
+
+  static uint64_t reverseMaskTable32u[8] = {
+      0x0405060700010203, 0x0C0D0E0F08090A0B, 0x1415161710111213, 
0x1C1D1E1F18191A1B,
+      0x2425262720212223, 0x2C2D2E2F28292A2B, 0x3435363730313233, 
0x3C3D3E3F38393A3B};
+
+  uint32_t getAlign(uint32_t start_bit, uint32_t base, uint32_t bitsize) {

Review Comment:
   ```suggestion
     inline uint32_t getAlign(uint32_t start_bit, uint32_t base, uint32_t bitsize) {
   ```



##########
c++/src/BpackingDefault.hh:
##########
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BPACKINGDEFAULT_HH
+#define ORC_BPACKINGDEFAULT_HH
+
+#include <stdlib.h>
+#include <cstdint>
+
+#include "Bpacking.hh"
+#include "RLEv2.hh"
+#include "io/InputStream.hh"
+#include "io/OutputStream.hh"

Review Comment:
   Ditto



##########
c++/src/BpackingAvx512.hh:
##########
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BPACKINGAVX512_HH
+#define ORC_BPACKINGAVX512_HH
+
+#include <stdlib.h>
+#include <cstdint>
+
+#include "BpackingDefault.hh"
+#include "Dispatch.hh"
+#include "RLEv2.hh"
+#include "io/InputStream.hh"
+#include "io/OutputStream.hh"

Review Comment:
   Are they required, or can they be removed?



##########
c++/test/TestRleVectorDecoder.cc:
##########
@@ -0,0 +1,561 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdlib>
+
+#include "MemoryOutputStream.hh"
+#include "RLEv2.hh"
+#include "wrap/gtest-wrapper.h"
+#include "wrap/orc-proto-wrapper.hh"
+
+#ifdef __clang__
+DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations")
+#endif
+
+namespace orc {
+  using ::testing::TestWithParam;
+  using ::testing::Values;
+
+  const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024;  // 1M
+  const char finish = '#';
+  std::string flags = "-\\|/";
+
+  class RleV2BitUnpackAvx512Test : public TestWithParam<bool> {
+    virtual void SetUp();
+
+   protected:
+    bool alignBitpacking;
+    std::unique_ptr<RleEncoder> getEncoder(RleVersion version, 
MemoryOutputStream& memStream,
+                                           bool isSigned);
+
+    void runExampleTest(int64_t* inputData, uint64_t inputLength, unsigned 
char* expectedOutput,
+                        uint64_t outputLength);
+
+    void runTest(RleVersion version, uint64_t numValues, int64_t start, 
int64_t delta, bool random,
+                 bool isSigned, uint8_t bitWidth, uint64_t blockSize = 0, 
uint64_t numNulls = 0);
+  };
+
+  void vectorDecodeAndVerify(RleVersion version, const MemoryOutputStream& 
memStream, int64_t* data,
+                             uint64_t numValues, const char* notNull, uint64_t 
blockSize,
+                             bool isSinged) {
+    std::unique_ptr<RleDecoder> decoder =
+        createRleDecoder(std::unique_ptr<SeekableArrayInputStream>(new 
SeekableArrayInputStream(
+                             memStream.getData(), memStream.getLength(), 
blockSize)),
+                         isSinged, version, *getDefaultPool(), 
getDefaultReaderMetrics());
+
+    int64_t* decodedData = new int64_t[numValues];
+    decoder->next(decodedData, numValues, notNull);
+
+    for (uint64_t i = 0; i < numValues; ++i) {
+      if (!notNull || notNull[i]) {
+        EXPECT_EQ(data[i], decodedData[i]);
+      }
+    }
+
+    delete[] decodedData;
+  }
+
+  void RleV2BitUnpackAvx512Test::SetUp() {
+    alignBitpacking = GetParam();
+  }
+
+  void generateDataFolBits(uint64_t numValues, int64_t start, int64_t delta, 
bool random,

Review Comment:
   ```suggestion
     void generateDataForBits(uint64_t numValues, int64_t start, int64_t delta, 
bool random,
   ```



##########
c++/test/CMakeLists.txt:
##########
@@ -18,6 +18,10 @@ include_directories(
 
 set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX17_FLAGS} ${WARN_FLAGS}")
 
+if(BUILD_ENABLE_AVX512)
+  set(SIMD_TEST TestRleVectorDecoder.cc)

Review Comment:
   ```suggestion
     set(SIMD_TEST_SRCS TestRleVectorDecoder.cc)
   ```



##########
c++/src/CMakeLists.txt:
##########
@@ -184,13 +184,21 @@ set(SOURCE_FILES
   Timezone.cc
   TypeImpl.cc
   Vector.cc
-  Writer.cc)
+  Writer.cc
+  CpuInfoUtil.cc
+  BpackingDefault.cc)

Review Comment:
   Why is `CpuInfoUtil.cc` always required?



##########
c++/src/Bpacking.hh:
##########
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_BPACKING_HH
+#define ORC_BPACKING_HH
+
+#include <cstdint>
+
+#include "RLEv2.hh"

Review Comment:
   Can we use a forward declaration for `RleDecoderV2` and avoid including 
`RLEv2.hh` in the header?



##########
c++/src/RleDecoderV2.cc:
##########
@@ -17,26 +17,32 @@
  */
 
 #include "Adaptor.hh"
+// #include "Bpacking.hh"

Review Comment:
   Remove it



##########
c++/src/CMakeLists.txt:
##########
@@ -184,13 +184,21 @@ set(SOURCE_FILES
   Timezone.cc
   TypeImpl.cc
   Vector.cc
-  Writer.cc)
+  Writer.cc
+  CpuInfoUtil.cc
+  BpackingDefault.cc)

Review Comment:
   Please sort them alphabetically.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to