[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

rip-nsk Fri, 25 May 2018 08:02:24 -0700

Github user rip-nsk commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r190921029
  
    --- Diff: c++/src/RleEncoderV2.cc ---
    @@ -0,0 +1,768 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with option work for additional information
    + * regarding copyright ownership.  The ASF licenses option file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use option file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include "Adaptor.hh"
    +#include "Compression.hh"
    +#include "RLEv2.hh"
    +#include "RLEV2Util.hh"
    +
    +#define MAX_LITERAL_SIZE 512
    +#define MAX_SHORT_REPEAT_LENGTH 10
    +
    +namespace orc {
    +
    +/**
    + * Compute the bits required to represent pth percentile value
    + * @param data - array
    + * @param p - percentile value (&gt;=0.0 to &lt;=1.0)
    + * @return pth percentile bits
    + */
    +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t 
length, double p, bool reuseHist) {
    +    if ((p > 1.0) || (p <= 0.0)) {
    +        throw InvalidArgument("Invalid p value: " + std::to_string(p));
    +    }
    +
    +    if (!reuseHist) {
    +        // histogram that store the encoded bit requirement for each 
values.
    +        // maximum number of bits that can encoded is 32 (refer 
FixedBitSizes)
    +        memset(histgram, 0, 32 * sizeof(int32_t));
    +        // compute the histogram
    +        for(size_t i = offset; i < (offset + length); i++) {
    +            uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
    +            histgram[idx] += 1;
    +        }
    +    }
    +
    +    int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * 
(1.0 - p));
    +
    +    // return the bits required by pth percentile length
    +    for(int32_t i = HIST_LEN - 1; i >= 0; i--) {
    +        perLen -= histgram[i];
    +        if (perLen < 0) {
    +            return decodeBitWidth(static_cast<uint32_t>(i));
    +        }
    +    }
    +
    +    return 0;
    +}
    +
    +RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
    +                           bool hasSigned, bool alignBitPacking) :
    +        RleEncoder(std::move(outStream), hasSigned),
    +        alignedBitPacking(alignBitPacking),
    +        prevDelta(0){
    +    literals = new int64_t[MAX_LITERAL_SIZE];
    +    gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
    +    zigzagLiterals = new int64_t[MAX_LITERAL_SIZE];
    +    baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
    +    adjDeltas = new int64_t[MAX_LITERAL_SIZE];
    +}
    +
    +void RleEncoderV2::write(int64_t val) {
    +    if(numLiterals == 0) {
    +        initializeLiterals(val);
    +        return;
    +    }
    +
    +    if(numLiterals == 1) {
    +        prevDelta = val - literals[0];
    +        literals[numLiterals++] = val;
    +
    +        if(val == literals[0]) {
    +            fixedRunLength = 2;
    +            variableRunLength = 0;
    +        } else {
    +            fixedRunLength = 0;
    +            variableRunLength = 2;
    +        }
    +        return;
    +    }
    +
    +    int64_t currentDelta = val - literals[numLiterals - 1];
    +    EncodingOption option = {};
    +    if (prevDelta == 0 && currentDelta == 0) {
    +        // case 1: fixed delta run
    +        literals[numLiterals++] = val;
    +
    +        if (variableRunLength > 0) {
    +            // if variable run is non-zero then we are seeing repeating
    +            // values at the end of variable run in which case fixed Run
    +            // length is 2
    +            fixedRunLength = 2;
    +        }
    +        fixedRunLength++;
    +
    +        // if fixed run met the minimum condition and if variable
    +        // run is non-zero then flush the variable run and shift the
    +        // tail fixed runs to start of the buffer
    +        if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
    +            numLiterals -= MIN_REPEAT;
    +            variableRunLength -= (MIN_REPEAT - 1);
    +
    +            int64_t tailVals[MIN_REPEAT] = {0};
    +
    +            memcpy(tailVals, literals + numLiterals, sizeof(int64_t) * 
MIN_REPEAT);
    +            determineEncoding(option);
    +            writeValues(option);
    +
    +            // shift tail fixed runs to beginning of the buffer
    +            for (size_t i = 0; i < MIN_REPEAT; ++i) {
    +                literals[numLiterals++] = tailVals[i];
    +            }
    +        }
    +
    +        if (fixedRunLength == MAX_LITERAL_SIZE) {;
    +            determineEncoding(option);
    +            writeValues(option);
    +        }
    +        return;
    +    }
    +
    +    // case 2: variable delta run
    +
    +    // if fixed run length is non-zero and if it satisfies the
    +    // short repeat conditions then write the values as short repeats
    +    // else use delta encoding
    +    if (fixedRunLength >= MIN_REPEAT) {
    +        if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
    +            option.encoding = SHORT_REPEAT;
    +        } else {
    +            option.encoding = DELTA;
    +            option.isFixedDelta = true;
    +        }
    +        writeValues(option);
    +    }
    +
    +    // if fixed run length is <MIN_REPEAT and current value is
    +    // different from previous then treat it as variable run
    +    if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != 
literals[numLiterals - 1]) {
    +        variableRunLength = fixedRunLength;
    +        fixedRunLength = 0;
    +    }
    +
    +    // after writing values re-initialize the variables
    +    if (numLiterals == 0) {
    +        initializeLiterals(val);
    +    } else {
    +        prevDelta = val - literals[numLiterals - 1];
    +        literals[numLiterals++] = val;
    +        variableRunLength++;
    +
    +        if (variableRunLength == MAX_LITERAL_SIZE) {
    +            determineEncoding(option);
    +            writeValues(option);
    +        }
    +    }
    +}
    +
    +void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
    +    int64_t zzEncVal = 0;
    +    for (size_t i = 0; i < numLiterals; i++) {
    +        if (isSigned) {
    +            zzEncVal = zigZag(literals[i]);
    +        } else {
    +            zzEncVal = literals[i];
    +        }
    +        zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal;
    +    }
    +}
    +
    +void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
    +    // mask will be max value beyond which patch will be generated
    +    int64_t mask = static_cast<int64_t>(1LU << option.brBits95p) - 1;
    +
    +    // since we are considering only 95 percentile, the size of gap and
    +    // patch array can contain only be 5% values
    +    option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 
20)));
    +
    +    // #bit for patch
    +    option.patchWidth = option.brBits100p - option.brBits95p;
    +    option.patchWidth = getClosestFixedBits(option.patchWidth);
    +
    +    // if patch bit requirement is 64 then it will not possible to pack
    +    // gap and patch together in a long. To make sure gap and patch can be
    +    // packed together adjust the patch width
    +    if (option.patchWidth == 64) {
    +        option.patchWidth = 56;
    +        option.brBits95p = 8;
    +        mask = static_cast<int64_t>(1LU << option.brBits95p) - 1;
    --- End diff --
    
    (static_cast<int64_t>(1) << option.brBits95p)

---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Reply via email to