Github user rip-nsk commented on a diff in the pull request:
https://github.com/apache/orc/pull/273#discussion_r190921029
--- Diff: c++/src/RleEncoderV2.cc ---
@@ -0,0 +1,768 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with option work for additional information
+ * regarding copyright ownership. The ASF licenses option file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use option file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "Compression.hh"
+#include "RLEv2.hh"
+#include "RLEV2Util.hh"
+
+#define MAX_LITERAL_SIZE 512
+#define MAX_SHORT_REPEAT_LENGTH 10
+
+namespace orc {
+
+/**
+ * Compute the bits required to represent pth percentile value
+ * @param data - array
+ * @param p - percentile value (>=0.0 to <=1.0)
+ * @return pth percentile bits
+ */
+uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t
length, double p, bool reuseHist) {
+ if ((p > 1.0) || (p <= 0.0)) {
+ throw InvalidArgument("Invalid p value: " + std::to_string(p));
+ }
+
+ if (!reuseHist) {
+ // histogram that store the encoded bit requirement for each
values.
+ // maximum number of bits that can encoded is 32 (refer
FixedBitSizes)
+ memset(histgram, 0, 32 * sizeof(int32_t));
+ // compute the histogram
+ for(size_t i = offset; i < (offset + length); i++) {
+ uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
+ histgram[idx] += 1;
+ }
+ }
+
+ int32_t perLen = static_cast<int32_t>(static_cast<double>(length) *
(1.0 - p));
+
+ // return the bits required by pth percentile length
+ for(int32_t i = HIST_LEN - 1; i >= 0; i--) {
+ perLen -= histgram[i];
+ if (perLen < 0) {
+ return decodeBitWidth(static_cast<uint32_t>(i));
+ }
+ }
+
+ return 0;
+}
+
+RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
+ bool hasSigned, bool alignBitPacking) :
+ RleEncoder(std::move(outStream), hasSigned),
+ alignedBitPacking(alignBitPacking),
+ prevDelta(0){
+ literals = new int64_t[MAX_LITERAL_SIZE];
+ gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
+ zigzagLiterals = new int64_t[MAX_LITERAL_SIZE];
+ baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
+ adjDeltas = new int64_t[MAX_LITERAL_SIZE];
+}
+
+void RleEncoderV2::write(int64_t val) {
+ if(numLiterals == 0) {
+ initializeLiterals(val);
+ return;
+ }
+
+ if(numLiterals == 1) {
+ prevDelta = val - literals[0];
+ literals[numLiterals++] = val;
+
+ if(val == literals[0]) {
+ fixedRunLength = 2;
+ variableRunLength = 0;
+ } else {
+ fixedRunLength = 0;
+ variableRunLength = 2;
+ }
+ return;
+ }
+
+ int64_t currentDelta = val - literals[numLiterals - 1];
+ EncodingOption option = {};
+ if (prevDelta == 0 && currentDelta == 0) {
+ // case 1: fixed delta run
+ literals[numLiterals++] = val;
+
+ if (variableRunLength > 0) {
+ // if variable run is non-zero then we are seeing repeating
+ // values at the end of variable run in which case fixed Run
+ // length is 2
+ fixedRunLength = 2;
+ }
+ fixedRunLength++;
+
+ // if fixed run met the minimum condition and if variable
+ // run is non-zero then flush the variable run and shift the
+ // tail fixed runs to start of the buffer
+ if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
+ numLiterals -= MIN_REPEAT;
+ variableRunLength -= (MIN_REPEAT - 1);
+
+ int64_t tailVals[MIN_REPEAT] = {0};
+
+ memcpy(tailVals, literals + numLiterals, sizeof(int64_t) *
MIN_REPEAT);
+ determineEncoding(option);
+ writeValues(option);
+
+ // shift tail fixed runs to beginning of the buffer
+ for (size_t i = 0; i < MIN_REPEAT; ++i) {
+ literals[numLiterals++] = tailVals[i];
+ }
+ }
+
+ if (fixedRunLength == MAX_LITERAL_SIZE) {;
+ determineEncoding(option);
+ writeValues(option);
+ }
+ return;
+ }
+
+ // case 2: variable delta run
+
+ // if fixed run length is non-zero and if it satisfies the
+ // short repeat conditions then write the values as short repeats
+ // else use delta encoding
+ if (fixedRunLength >= MIN_REPEAT) {
+ if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
+ option.encoding = SHORT_REPEAT;
+ } else {
+ option.encoding = DELTA;
+ option.isFixedDelta = true;
+ }
+ writeValues(option);
+ }
+
+ // if fixed run length is <MIN_REPEAT and current value is
+ // different from previous then treat it as variable run
+ if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val !=
literals[numLiterals - 1]) {
+ variableRunLength = fixedRunLength;
+ fixedRunLength = 0;
+ }
+
+ // after writing values re-initialize the variables
+ if (numLiterals == 0) {
+ initializeLiterals(val);
+ } else {
+ prevDelta = val - literals[numLiterals - 1];
+ literals[numLiterals++] = val;
+ variableRunLength++;
+
+ if (variableRunLength == MAX_LITERAL_SIZE) {
+ determineEncoding(option);
+ writeValues(option);
+ }
+ }
+}
+
+void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
+ int64_t zzEncVal = 0;
+ for (size_t i = 0; i < numLiterals; i++) {
+ if (isSigned) {
+ zzEncVal = zigZag(literals[i]);
+ } else {
+ zzEncVal = literals[i];
+ }
+ zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal;
+ }
+}
+
+void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
+ // mask will be max value beyond which patch will be generated
+ int64_t mask = static_cast<int64_t>(1LU << option.brBits95p) - 1;
+
+ // since we are considering only 95 percentile, the size of gap and
+ // patch array can contain only be 5% values
+ option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals /
20)));
+
+ // #bit for patch
+ option.patchWidth = option.brBits100p - option.brBits95p;
+ option.patchWidth = getClosestFixedBits(option.patchWidth);
+
+ // if patch bit requirement is 64 then it will not possible to pack
+ // gap and patch together in a long. To make sure gap and patch can be
+ // packed together adjust the patch width
+ if (option.patchWidth == 64) {
+ option.patchWidth = 56;
+ option.brBits95p = 8;
+ mask = static_cast<int64_t>(1LU << option.brBits95p) - 1;
--- End diff --
(static_cast<int64_t>(1) << option.brBits95p)
---