Github user rip-nsk commented on a diff in the pull request: https://github.com/apache/orc/pull/273#discussion_r190920599 --- Diff: c++/src/RleEncoderV2.cc --- @@ -0,0 +1,768 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with option work for additional information + * regarding copyright ownership. The ASF licenses option file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use option file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "RLEv2.hh" +#include "RLEV2Util.hh" + +#define MAX_LITERAL_SIZE 512 +#define MAX_SHORT_REPEAT_LENGTH 10 + +namespace orc { + +/** + * Compute the bits required to represent pth percentile value + * @param data - array + * @param p - percentile value (>=0.0 to <=1.0) + * @return pth percentile bits + */ +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) { + if ((p > 1.0) || (p <= 0.0)) { + throw InvalidArgument("Invalid p value: " + std::to_string(p)); + } + + if (!reuseHist) { + // histogram that store the encoded bit requirement for each values. + // maximum number of bits that can encoded is 32 (refer FixedBitSizes) + memset(histgram, 0, 32 * sizeof(int32_t)); + // compute the histogram + for(size_t i = offset; i < (offset + length); i++) { + uint32_t idx = encodeBitWidth(findClosestNumBits(data[i])); + histgram[idx] += 1; + } + } + + int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p)); + + // return the bits required by pth percentile length + for(int32_t i = HIST_LEN - 1; i >= 0; i--) { + perLen -= histgram[i]; + if (perLen < 0) { + return decodeBitWidth(static_cast<uint32_t>(i)); + } + } + + return 0; +} + +RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, + bool hasSigned, bool alignBitPacking) : + RleEncoder(std::move(outStream), hasSigned), + alignedBitPacking(alignBitPacking), + prevDelta(0){ + literals = new int64_t[MAX_LITERAL_SIZE]; + gapVsPatchList = new int64_t[MAX_LITERAL_SIZE]; + zigzagLiterals = new int64_t[MAX_LITERAL_SIZE]; + baseRedLiterals = new int64_t[MAX_LITERAL_SIZE]; + adjDeltas = new int64_t[MAX_LITERAL_SIZE]; +} + +void RleEncoderV2::write(int64_t val) { + if(numLiterals == 0) { + initializeLiterals(val); + return; + } + + if(numLiterals == 1) { + prevDelta = val - literals[0]; + literals[numLiterals++] = val; + + if(val == literals[0]) { + fixedRunLength = 2; + variableRunLength = 0; + } else { + fixedRunLength = 0; + variableRunLength = 2; + } + return; + } + + int64_t currentDelta = val - literals[numLiterals - 1]; + EncodingOption option = {}; + if (prevDelta == 0 && currentDelta == 0) { + // case 1: fixed delta run + literals[numLiterals++] = val; + + if (variableRunLength > 0) { + // if variable run is non-zero then we are seeing repeating + // values at the end of variable run in which case fixed Run + // length is 2 + fixedRunLength = 2; + } + fixedRunLength++; + + // if fixed run met the minimum condition and if variable + // run is non-zero then flush the variable run and shift the + // tail fixed runs to start of the buffer + if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { + numLiterals -= MIN_REPEAT; + variableRunLength -= (MIN_REPEAT - 1); + + int64_t tailVals[MIN_REPEAT] = {0}; + + memcpy(tailVals, literals + numLiterals, sizeof(int64_t) * MIN_REPEAT); + determineEncoding(option); + writeValues(option); + + // shift tail fixed runs to beginning of the buffer + for (size_t i = 0; i < MIN_REPEAT; ++i) { + literals[numLiterals++] = tailVals[i]; + } + } + + if (fixedRunLength == MAX_LITERAL_SIZE) {; + determineEncoding(option); + writeValues(option); + } + return; + } + + // case 2: variable delta run + + // if fixed run length is non-zero and if it satisfies the + // short repeat conditions then write the values as short repeats + // else use delta encoding + if (fixedRunLength >= MIN_REPEAT) { + if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { + option.encoding = SHORT_REPEAT; + } else { + option.encoding = DELTA; + option.isFixedDelta = true; + } + writeValues(option); + } + + // if fixed run length is <MIN_REPEAT and current value is + // different from previous then treat it as variable run + if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) { + variableRunLength = fixedRunLength; + fixedRunLength = 0; + } + + // after writing values re-initialize the variables + if (numLiterals == 0) { + initializeLiterals(val); + } else { + prevDelta = val - literals[numLiterals - 1]; + literals[numLiterals++] = val; + variableRunLength++; + + if (variableRunLength == MAX_LITERAL_SIZE) { + determineEncoding(option); + writeValues(option); + } + } +} + +void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) { + int64_t zzEncVal = 0; + for (size_t i = 0; i < numLiterals; i++) { + if (isSigned) { + zzEncVal = zigZag(literals[i]); + } else { + zzEncVal = literals[i]; + } + zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal; + } +} + +void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { + // mask will be max value beyond which patch will be generated + int64_t mask = static_cast<int64_t>(1LU << option.brBits95p) - 1; --- End diff -- (static_cast<int64_t>(1) << option.brBits95p)
---