HuaHuaY commented on code in PR #47294: URL: https://github.com/apache/arrow/pull/47294#discussion_r2306668207
########## cpp/src/arrow/util/bit_util.h: ########## @@ -365,5 +365,118 @@ void PackBits(const uint32_t* values, uint8_t* out) { } } +constexpr int64_t MaxLEB128ByteLen(int64_t n_bits) { return CeilDiv(n_bits, 7); } + +template <typename Int> +constexpr int64_t MaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8); + +/// Write a integer as LEB128 +/// +/// Write the input value as LEB128 into the outptu buffer and return the number of bytes +/// written. +/// If the output buffer size is insufficient, return 0 but the output may have been +/// written to. +/// +/// \see https://en.wikipedia.org/wiki/LEB128 +/// \see MaxLEB128ByteLenFor +template <typename Int> +constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) { + constexpr Int kLow7Mask = Int(0x7F); + constexpr Int kHigh7Mask = ~kLow7Mask; + constexpr uint8_t kContinuationBit = 0x80; + + auto const out_first = out; + + // Write as many bytes as the could be for the given input + while ((value & kHigh7Mask) != Int(0)) { + // We do not have enough room to write the LEB128 + if (out - out_first >= max_out_size) { + return 0; + } + + // Write the encoded byte with continuation bit + *out = static_cast<uint8_t>(value & kLow7Mask) | kContinuationBit; + ++out; + // Shift remaining data + value >>= 7; + } + + // We do not have enough room to write the LEB128 + if (out - out_first >= max_out_size) { + return 0; + } + + // Write last non-continuing byte + *out = static_cast<uint8_t>(value & kLow7Mask); + ++out; + + return static_cast<int32_t>(out - out_first); +} + +/// Parse a leading LEB128 +/// +/// Take as input a data pointer and the maximum number of bytes that can be read from it +/// (typically the array size). +/// When a valid LEB128 is found at the start of the data, the function writes it to the +/// out pointer and return the number of bytes read. +/// Otherwise, the out pointer is unmodified and zero is returned. +/// +/// \see https://en.wikipedia.org/wiki/LEB128 +/// \see MaxLEB128ByteLenFor +template <typename Int> +constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t max_data_size, + Int* out) { + constexpr auto kMaxBytes = static_cast<int32_t>(MaxLEB128ByteLenFor<Int>); + static_assert(kMaxBytes >= 1); + constexpr uint8_t kLow7Mask = 0x7F; + constexpr uint8_t kContinuationBit = 0x80; + constexpr int32_t kSignBitCount = std::is_signed_v<Int> ? 1 : 0; + // Number of bits allowed for encoding data on the last byte to avoid overflow + constexpr uint8_t kHighBitCount = (8 * sizeof(Int) - kSignBitCount) % 7; + // kHighBitCount least significant `0` bits and the rest with `1` + constexpr uint8_t kHighForbiddenMask = ~((1 << kHighBitCount) - 1); + + // Iteratively building the value + std::make_unsigned_t<Int> value = 0; + + // Read as many bytes as the could be for the give output + for (int32_t i = 0; i < kMaxBytes - 1; i++) { + // We have not finished reading a valid LEB128, yet we run out of data + if (ARROW_PREDICT_FALSE(i >= max_data_size)) { + return 0; + } + + // Read the byte and set its 7 LSB to in the final value + uint8_t const byte = data[i]; + value |= static_cast<Int>(byte & kLow7Mask) << (7 * i); + + // Check for lack of continuation flag in MSB + if ((byte & kContinuationBit) == 0) { + *out = value; + return i + 1; + } + } + + // Process the last index avoiding overflowing + constexpr int32_t last = kMaxBytes - 1; + + // We have not finished reading a valid LEB128, yet we run out of data + if (ARROW_PREDICT_FALSE(last >= max_data_size)) { + return 0; + } + + uint8_t const byte = data[last]; + + // Need to check if there are bits that would overflow the output. + // Also checks that there is no continuation. + if (ARROW_PREDICT_FALSE((byte & kHighForbiddenMask) != 0)) { Review Comment: In my opinion, due to right shift of `uint64_t`, high bit remains bit0 when original `BitWriter::PutVlqInt(uint64_t v)` writes the last byte. In which case should we treat the last byte specially in `WriteLEB128` and `ParseLeadingLEB128 `? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org